[PATCH RFC] parsers: add a C function to pack the dirstate

Adrian Buehlmann adrian at cadifra.com
Wed Jun 6 18:38:57 CDT 2012


On 2012-06-06 19:38, Bryan O'Sullivan wrote:
> # HG changeset patch
> # User Bryan O'Sullivan <bryano at fb.com>
> # Date 1339003585 25200
> # Node ID 819f18577783fcd78e8e68b79e1d79d22a636b3f
> # Parent  025e329b1ee8f4031663024b1cd1b68d575c55de
> parsers: add a C function to pack the dirstate
> 
> This is about 9 times faster than the Python dirstate packing code.
> The relatively small speedup is due to the poor locality and memory
> access patterns caused by traversing dicts and other boxed Python
> values.

Thank you for working on making Mercurial faster.

You have now resent this patch a couple of times. It is a very difficult
to review patch.

At one point, I tried a version of that patch and found an issue, which
you have fixed.

The question is: what have you changed for this version?

I had started to look a bit at an earlier patch mail, and tried to see
if I find something obvious. But I failed. Which means I couldn't find a
defect by looking at the patch. Given that I'm not that qualified, this
probably doesn't mean much. So I decided to not respond.

So, basically, this looks good to me, but IMHO it is risky, due to the
sheer amount of C code. If it is worth the risk, then I'm fine with
taking the risk.

The thing with the "now" time appeared correct to me, last time I looked
at a version of this patch (I did spend quite a lot of time thinking
about that "now" thing in the python code at some point in the past and
contributed patches around that corner).

And for the record: It was also your patches that motivated me to help
improve the testsuite on Windows. In the past, there weren't that many
changes to the C parts of mercurial.

> diff --git a/mercurial/dirstate.py b/mercurial/dirstate.py
> --- a/mercurial/dirstate.py
> +++ b/mercurial/dirstate.py
> @@ -498,12 +498,24 @@
>              return
>          st = self._opener("dirstate", "w", atomictemp=True)
>  
> +        def finish(s):
> +            st.write(s)
> +            st.close()
> +            self._lastnormaltime = 0
> +            self._dirty = self._dirtypl = False
> +
>          # use the modification time of the newly created temporary file as the
>          # filesystem's notion of 'now'
> -        now = int(util.fstat(st).st_mtime)
> +        now = util.fstat(st).st_mtime
> +        copymap = self._copymap
> +        try:
> +            finish(parsers.pack_dirstate(self._map, copymap, self._pl, now))
> +            return
> +        except AttributeError:
> +            pass
>  
> +        now = int(now)
>          cs = cStringIO.StringIO()
> -        copymap = self._copymap
>          pack = struct.pack
>          write = cs.write
>          write("".join(self._pl))
> @@ -526,10 +538,7 @@
>              e = pack(_format, e[0], e[1], e[2], e[3], len(f))
>              write(e)
>              write(f)
> -        st.write(cs.getvalue())
> -        st.close()
> -        self._lastnormaltime = 0
> -        self._dirty = self._dirtypl = False
> +        finish(cs.getvalue())
>  
>      def _dirignore(self, f):
>          if f == '.':
> diff --git a/mercurial/parsers.c b/mercurial/parsers.c
> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -214,6 +214,154 @@
>  	return ret;
>  }
>  
> +static inline int getintat(PyObject *tuple, int off, uint32_t *v)
> +{
> +	PyObject *o = PyTuple_GET_ITEM(tuple, off);
> +	long val;
> +
> +	if (PyInt_Check(o))
> +		val = PyInt_AS_LONG(o);
> +	else if (PyLong_Check(o)) {
> +		val = PyLong_AsLong(o);
> +		if (val == -1 && PyErr_Occurred())
> +			return -1;
> +	} else {
> +		PyErr_SetString(PyExc_TypeError, "expected an int or long");
> +		return -1;
> +	}
> +	if (LONG_MAX > INT_MAX && (val > INT_MAX || val < INT_MIN)) {
> +		PyErr_SetString(PyExc_OverflowError,
> +				"Python value to large to convert to uint32_t");
> +		return -1;
> +	}
> +	*v = (uint32_t)val;
> +	return 0;
> +}
> +
> +static PyObject *dirstate_unset;
> +
> +/*
> + * Efficiently pack a dirstate object into its on-disk format.
> + */
> +static PyObject *pack_dirstate(PyObject *self, PyObject *args)
> +{
> +	PyObject *packobj = NULL;
> +	PyObject *map, *copymap, *pl;
> +	Py_ssize_t nbytes, pos, l;
> +	PyObject *k, *v, *pn;
> +	char *p, *s;
> +	double now;
> +
> +	if (!PyArg_ParseTuple(args, "O!O!Od:pack_dirstate",
> +			      &PyDict_Type, &map, &PyDict_Type, &copymap,
> +			      &pl, &now))
> +		return NULL;
> +
> +	if (!PySequence_Check(pl) || PySequence_Size(pl) != 2) {
> +		PyErr_SetString(PyExc_TypeError, "expected 2-element sequence");
> +		return NULL;
> +	}
> +
> +	/* Figure out how much we need to allocate. */
> +	for (nbytes = 40, pos = 0; PyDict_Next(map, &pos, &k, &v);) {
> +		PyObject *c;
> +		if (!PyString_Check(k)) {
> +			PyErr_SetString(PyExc_TypeError, "expected string key");
> +			goto bail;
> +		}
> +		nbytes += PyString_GET_SIZE(k) + 17;
> +		c = PyDict_GetItem(copymap, k);
> +		if (c) {
> +			if (!PyString_Check(c)) {
> +				PyErr_SetString(PyExc_TypeError,
> +						"expected string key");
> +				goto bail;
> +			}
> +			nbytes += PyString_GET_SIZE(c) + 1;
> +		}
> +	}
> +
> +	packobj = PyString_FromStringAndSize(NULL, nbytes);
> +	if (packobj == NULL)
> +		goto bail;
> +
> +	p = PyString_AS_STRING(packobj);
> +
> +	pn = PySequence_ITEM(pl, 0);
> +	if (PyString_AsStringAndSize(pn, &s, &l) == -1 || l != 20) {
> +		PyErr_SetString(PyExc_TypeError, "expected a 20-byte hash");
> +		goto bail;
> +	}
> +	memcpy(p, s, l);
> +	p += 20;
> +	pn = PySequence_ITEM(pl, 1);
> +	if (PyString_AsStringAndSize(pn, &s, &l) == -1 || l != 20) {
> +		PyErr_SetString(PyExc_TypeError, "expected a 20-byte hash");
> +		goto bail;
> +	}
> +	memcpy(p, s, l);
> +	p += 20;
> +
> +	for (pos = 0; PyDict_Next(map, &pos, &k, &v); ) {
> +		uint32_t mode, size, mtime;
> +		Py_ssize_t len, l;
> +		PyObject *o;
> +		char *s, *t;
> +		int err;
> +
> +		if (!PyTuple_Check(v) || PyTuple_GET_SIZE(v) != 4) {
> +			PyErr_SetString(PyExc_TypeError, "expected a 4-tuple");
> +			goto bail;
> +		}
> +		o = PyTuple_GET_ITEM(v, 0);
> +		if (PyString_AsStringAndSize(o, &s, &l) == -1 || l != 1) {
> +			PyErr_SetString(PyExc_TypeError, "expected one byte");
> +			goto bail;
> +		}
> +		*p++ = *s;
> +		err = getintat(v, 1, &mode);
> +		err |= getintat(v, 2, &size);
> +		err |= getintat(v, 3, &mtime);
> +		if (err)
> +			goto bail;
> +		if (*s == 'n' && mtime == (uint32_t)now) {
> +			/* See dirstate.py:write for why we do this. */
> +			if (PyDict_SetItem(map, k, dirstate_unset) == -1)
> +				goto bail;
> +			mode = 0, size = -1, mtime = -1;
> +		}
> +		putbe32(mode, p);
> +		putbe32(size, p + 4);
> +		putbe32(mtime, p + 8);
> +		t = p + 12;
> +		p += 16;
> +		len = PyString_GET_SIZE(k);
> +		memcpy(p, PyString_AS_STRING(k), len);
> +		p += len;
> +		o = PyDict_GetItem(copymap, k);
> +		if (o) {
> +			*p++ = '\0';
> +			l = PyString_GET_SIZE(o);
> +			memcpy(p, PyString_AS_STRING(o), l);
> +			p += l;
> +			len += l + 1;
> +		}
> +		putbe32((uint32_t)len, t);
> +	}
> +
> +	pos = p - PyString_AS_STRING(packobj);
> +	if (pos != nbytes) {
> +		PyErr_Format(PyExc_SystemError, "bad dirstate size: %ld != %ld",
> +                             (long)pos, (long)nbytes);
> +		goto bail;
> +	}
> +
> +	return packobj;
> +bail:
> +	Py_XDECREF(packobj);
> +	return NULL;
> +}
> +
>  /*
>   * A base-16 trie for fast node->rev mapping.
>   *
> @@ -1356,6 +1504,7 @@
>  static char parsers_doc[] = "Efficient content parsing.";
>  
>  static PyMethodDef methods[] = {
> +	{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
>  	{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
>  	{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
>  	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
> @@ -1375,6 +1524,8 @@
>  				  -1, -1, -1, -1, nullid, 20);
>  	if (nullentry)
>  		PyObject_GC_UnTrack(nullentry);
> +
> +	dirstate_unset = Py_BuildValue("ciii", 'n', 0, -1, -1);
>  }
>  
>  #ifdef IS_PY3K
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
> 
> 


More information about the Mercurial-devel mailing list