[PATCH 1 of 2] parsers: incrementally parse the revlog index in C

Thu Mar 29 14:51:09 CDT 2012

On Thu, 2012-03-22 at 11:44 -0700, Bryan O'Sullivan wrote:

> +index_ass_subscript(indexObject* self, PyObject* item, PyObject* value)

I'm not sure what that abbreviation means here. Perhaps something like
this? http://t.co/4CQiLKim

> +/*
> + * returns a tuple of the form (index, None, cache) with elements as
> + * follows:
>   *
> - * index: a list of tuples containing the RevlogNG records
> - * cache: if data is inlined, a tuple (index_file_content, 0) else None
> + * index: an index object that lazily parses the RevlogNG records
> + * cache: if data is inlined, a tuple (index_file_content, 0), else None
> + *
> + * added complications are for backwards compatibility
>   */

I'm afraid this isn't actually backward compatible. If I update to this
changeset without recompiling, I get an exception:

 ValueError: need more than 2 values to unpack

If I pop the changeset without recompiling, I get this:

 abort: index 00changelog.i is corrupted!

..which is a big headache for bisecting.

But otherwise, this is looking pretty good. I spent a while kicking the
tires on this last night and trying to optimize it. Some observations:

- quick index loads for large indexes are, as predicted, massively
faster
- retrieving individual entries after load is 5x SLOWER
- about 99% of that time is spent in Py_BuildValue, so we've simply
moved that cost from load time to lookup time
- which means there's basically nothing that can be optimized in
index_get
- perfheads is about twice as fast when index load time is included
- verify is a few percent slower, presumably because it builds the same
entries repeatedly

So I think there are two ways we can go in the future to make this
faster:

- cache each entry we build
- treat the index as a tuple of lists rather than a list of tuples

The latter idea involves two observations. First, an operation like
heads() only uses two elements out of each retrieved entry, so most of
the work building the entry tuple is wasted. Second, having a separate
list per property does away with one tuple allocation per entry as well
as all the extra overhead of parsing a value description.

>  static PyObject *parse_index2(PyObject *self, PyObject *args)
>  {
>  	const char *data;
> -	int size, inlined;
> -	PyObject *rval = NULL, *index = NULL, *cache = NULL;
> -	PyObject *data_obj = NULL, *inlined_obj;
> +	int size, ret;
> +	PyObject *inlined_obj, *tuple = NULL, *cache = NULL, *none = NULL;
> +	indexObject *idx;
>  
>  	if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj))
>  		return NULL;
> -	inlined = inlined_obj && PyObject_IsTrue(inlined_obj);
>  
> -	/* If no data is inlined, we know the size of the index list in
> -	 * advance: size divided by the size of one revlog record (64 bytes)
> -	 * plus one for nullid */
> -	index = inlined ? PyList_New(0) : PyList_New(size / 64 + 1);
> -	if (!index)
> -		goto quit;
> +	idx = PyObject_New(indexObject, &indexType);
>  
> -	/* set up the cache return value */
> -	if (inlined) {
> -		/* Note that the reference to data_obj is only borrowed */
> -		data_obj = PyTuple_GET_ITEM(args, 0);
> -		cache = Py_BuildValue("iO", 0, data_obj);
> -		if (!cache)
> -			goto quit;
> +	if (idx == NULL)
> +		goto bail;
> +
> +	ret = index_real_init(idx, data, size, inlined_obj,
> +			      PyTuple_GET_ITEM(args, 0));
> +	if (ret)
> +		goto bail;
> +
> +	if (idx->inlined) {
> +		Py_INCREF(idx->data);
> +		cache = Py_BuildValue("iO", 0, idx->data);
> +		if (cache == NULL)
> +			goto bail;
>  	} else {
>  		cache = Py_None;
> -		Py_INCREF(Py_None);
> +		Py_INCREF(cache);
>  	}
>  
> -	/* actually populate the index with data */
> -	if (!_parse_index_ng(data, size, inlined, index))
> -		goto quit;
> +	none = Py_None;
> +	Py_INCREF(none);
>  
> -	rval = Py_BuildValue("NN", index, cache);
> -	if (!rval)
> -		goto quit;
> -	return rval;
> +	tuple = Py_BuildValue("NNN", idx, none, cache);
> +	if (!tuple)
> +		goto bail;
> +	return tuple;
>  
> -quit:
> -	Py_XDECREF(index);
> +bail:
> +	Py_XDECREF(idx);
>  	Py_XDECREF(cache);
> -	Py_XDECREF(rval);
> +	Py_XDECREF(none);
> +	Py_XDECREF(tuple);
>  	return NULL;
>  }
>  
> -
>  static char parsers_doc[] = "Efficient content parsing.";
>  
>  static PyMethodDef methods[] = {
> @@ -396,6 +663,22 @@
>  	{NULL, NULL}
>  };
>  
> +static void module_init(PyObject *mod)
> +{
> +	static const char nullid[20];
> +
> +	if (PyType_Ready(&indexType) < 0)
> +		return;
> +	Py_INCREF(&indexType);
> +
> +	PyModule_AddObject(mod, "index", (PyObject *)&indexType);
> +
> +	nullentry = Py_BuildValue("iiiiiiis#", 0, 0, 0,
> +				  -1, -1, -1, -1, nullid, 20);
> +	if (nullentry)
> +		PyObject_GC_UnTrack(nullentry);
> +}
> +
>  #ifdef IS_PY3K
>  static struct PyModuleDef parsers_module = {
>  	PyModuleDef_HEAD_INIT,
> @@ -407,12 +690,15 @@
>  
>  PyMODINIT_FUNC PyInit_parsers(void)
>  {
> -	return PyModule_Create(&parsers_module);
> +	PyObject *mod = PyModule_Create(&parsers_module);
> +	module_init(mod);
> +	return mod;
>  }
>  #else
>  PyMODINIT_FUNC initparsers(void)
>  {
> -	Py_InitModule3("parsers", methods, parsers_doc);
> +	PyObject *mod = Py_InitModule3("parsers", methods, parsers_doc);
> +	module_init(mod);
>  }
>  #endif
>  
> diff --git a/mercurial/revlog.py b/mercurial/revlog.py
> --- a/mercurial/revlog.py
> +++ b/mercurial/revlog.py
> @@ -171,10 +171,7 @@
>      def __init__(self):
>          self.size = struct.calcsize(indexformatng)
>  
> -    def parseindex(self, data, inline):
> -        # call the C implementation to parse the index data
> -        index, cache = parsers.parse_index2(data, inline)
> -        return index, None, cache
> +    parseindex = staticmethod(parsers.parse_index2)
>  
>      def packentry(self, entry, node, version, rev):
>          p = _pack(indexformatng, *entry)
> diff --git a/tests/test-parseindex2.py b/tests/test-parseindex2.py
> --- a/tests/test-parseindex2.py
> +++ b/tests/test-parseindex2.py
> @@ -52,7 +52,6 @@
>  
>      return index, cache
>  
> -
>  data_inlined = '\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x01\x8c' \
>      '\x00\x00\x04\x07\x00\x00\x00\x00\x00\x00\x15\x15\xff\xff\xff' \
>      '\xff\xff\xff\xff\xff\xebG\x97\xb7\x1fB\x04\xcf\x13V\x81\tw\x1b' \
> @@ -94,13 +93,16 @@
>      '\xb6\r\x98B\xcb\x07\xbd`\x8f\x92\xd9\xc4\x84\xbdK\x00\x00\x00' \
>      '\x00\x00\x00\x00\x00\x00\x00\x00\x00'
>  
> +def parse_index2(data, inline):
> +    index, alwaysnone, chunkcache = parsers.parse_index2(data, inline)
> +    return list(index), chunkcache
> +
>  def runtest() :
> -
>      py_res_1 = py_parseindex(data_inlined, True)
> -    c_res_1 = parsers.parse_index2(data_inlined, True)
> +    c_res_1 = parse_index2(data_inlined, True)
>  
>      py_res_2 = py_parseindex(data_non_inlined, False)
> -    c_res_2 = parsers.parse_index2(data_non_inlined, False)
> +    c_res_2 = parse_index2(data_non_inlined, False)
>  
>      if py_res_1 != c_res_1:
>          print "Parse index result (with inlined data) differs!"

-- 
Mathematics is the supreme nostalgia of our time.