[PATCH] parsers: use base-16 trie for faster node->rev mapping

Bryan O'Sullivan bos at serpentine.com
Fri Apr 6 02:37:26 CDT 2012


# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1333697739 25200
# Branch stable
# Node ID b7301b69b4e390d61068902292ec108fbbec9337
# Parent  d57ad50ac22c51b884f73712edebee3269c64cf6
parsers: use base-16 trie for faster node->rev mapping

This greatly speeds up node->rev lookups in most cases: "hg --time
log" of rev 1000 on a linux-2.6 repo improves from 0.27 seconds to
0.08.

The new perfnodelookup command in contrib/perf.py demonstrates the
speedup more dramatically, since it performs no I/O.  For a single
lookup, the new code is about 40x faster.

These changes also lay the groundwork for further improving the
performance of prefix-based node lookups.

(Almost everything is faster, but not absolutely everything: "hg
verify" runs about 2% slower, and I don't yet know why.)

diff --git a/contrib/perf.py b/contrib/perf.py
--- a/contrib/perf.py
+++ b/contrib/perf.py
@@ -104,6 +104,16 @@
 def perflookup(ui, repo, rev):
     timer(lambda: len(repo.lookup(rev)))
 
+def perfnodelookup(ui, repo, rev):
+    import mercurial.revlog
+    mercurial.revlog._prereadsize = 2**24 # disable lazy parser in old hg
+    n = repo[rev].node()
+    cl = mercurial.revlog.revlog(repo.sopener, "00changelog.i")
+    def d():
+        cl.rev(n)
+        cl.clearcaches()
+    timer(d)
+
 def perflog(ui, repo, **opts):
     ui.pushbuffer()
     timer(lambda: commands.log(ui, repo, rev=[], date='', user='',
@@ -146,6 +156,7 @@
 
 cmdtable = {
     'perflookup': (perflookup, []),
+    'perfnodelookup': (perfnodelookup, []),
     'perfparents': (perfparents, []),
     'perfstartup': (perfstartup, []),
     'perfstatus': (perfstatus, []),
diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -242,10 +242,27 @@
 }
 
 /*
- * A list-like object that decodes the contents of a RevlogNG index
- * file on demand. It has limited support for insert and delete at the
- * last element before the end.  The last entry is always a sentinel
- * nullid.
+ * A base-16 trie for fast node->rev mapping.
+ *
+ * Positive value is index of the next node in the trie
+ * Negative value is a leaf: -(rev + 1)
+ * Zero is empty
+ */
+typedef struct {
+	int children[16];
+} nodetree;
+
+/*
+ * This class has two behaviours.
+ *
+ * When used in a list-like way (with integer keys), we decode an
+ * entry in a RevlogNG index file on demand. Our last entry is a
+ * sentinel, always a nullid.  We have limited support for
+ * integer-keyed insert and delete, only at elements right before the
+ * sentinel.
+ *
+ * With string keys, we lazily perform a reverse mapping from node to
+ * rev, using a base-16 trie.
  */
 typedef struct {
 	PyObject_HEAD
@@ -256,10 +273,17 @@
 	Py_ssize_t raw_length;	/* original number of elements */
 	Py_ssize_t length;	/* current number of elements */
 	PyObject *added;	/* populated on demand */
-	int inlined;
+	nodetree *nt;
+	int ntlength;		/* # nodes in use */
+	int ntcapacity;		/* # nodes allocated */
+	int ntdepth;		/* maximum depth of tree */
+	int ntsplits;		/* # splits performed */
+	int ntrev;		/* last rev scanned */
+	int ntuses;		/* # accesses (capped) */
+	int inlined;		/* bool: index contains inline data */
 } indexObject;
 
-static Py_ssize_t index_length(indexObject *self)
+static Py_ssize_t index_length(const indexObject *self)
 {
 	if (self->added == NULL)
 		return self->length;
@@ -267,6 +291,7 @@
 }
 
 static PyObject *nullentry;
+static const char nullid[20];
 
 static long inline_scan(indexObject *self, const char **offsets);
 
@@ -276,7 +301,27 @@
 static const char *tuple_format = "kiiiiiis#";
 #endif
 
-/* RevlogNG format (all in big endian, data may be inlined):
+/*
+ * Return a pointer to the beginning of a RevlogNG record.
+ */
+static const char *index_deref(indexObject *self, Py_ssize_t pos)
+{
+	if (self->inlined && pos > 0) {
+		if (self->offsets == NULL) {
+			self->offsets = malloc(self->raw_length *
+					       sizeof(*self->offsets));
+			if (self->offsets == NULL)
+				return (const char *)PyErr_NoMemory();
+			inline_scan(self, self->offsets);
+		}
+		return self->offsets[pos];
+	}
+
+	return PyString_AS_STRING(self->data) + pos * 64;
+}
+
+/*
+ * RevlogNG format (all in big endian, data may be inlined):
  *    6 bytes: offset
  *    2 bytes: flags
  *    4 bytes: compressed length
@@ -297,7 +342,10 @@
 	Py_ssize_t length = index_length(self);
 	PyObject *entry;
 
-	if (pos >= length) {
+	if (pos < 0)
+		pos += length;
+
+	if (pos < 0 || pos >= length) {
 		PyErr_SetString(PyExc_IndexError, "revlog index out of range");
 		return NULL;
 	}
@@ -325,17 +373,9 @@
 			return PyErr_NoMemory();
 	}
 
-	if (self->inlined && pos > 0) {
-		if (self->offsets == NULL) {
-			self->offsets = malloc(self->raw_length *
-					       sizeof(*self->offsets));
-			if (self->offsets == NULL)
-				return PyErr_NoMemory();
-			inline_scan(self, self->offsets);
-		}
-		data = self->offsets[pos];
-	} else
-		data = PyString_AS_STRING(self->data) + pos * 64;
+	data = index_deref(self, pos);
+	if (data == NULL)
+		return NULL;
 
 	memcpy(decode, data, 8 * sizeof(uint32_t));
 
@@ -343,7 +383,7 @@
 	if (pos == 0) /* mask out version number for the first entry */
 		offset_flags &= 0xFFFF;
 	else {
-		uint32_t offset_high =	ntohl(decode[0]);
+		uint32_t offset_high = ntohl(decode[0]);
 		offset_flags |= ((uint64_t)offset_high) << 32;
 	}
 
@@ -368,26 +408,60 @@
 	return entry;
 }
 
+/*
+ * Return the 20-byte SHA of the node corresponding to the given rev.
+ */
+static const char *index_node(indexObject *self, Py_ssize_t pos)
+{
+	Py_ssize_t length = index_length(self);
+	const char *data;
+
+	if (pos == length - 1)
+		return nullid;
+
+	if (pos >= length)
+		return NULL;
+
+	if (pos >= self->length - 1) {
+		PyObject *tuple, *str;
+		tuple = PyList_GET_ITEM(self->added, pos - self->length + 1);
+		str = PyTuple_GetItem(tuple, 7);
+		return str ? PyString_AS_STRING(str) : NULL;
+	}
+
+	data = index_deref(self, pos);
+	return data ? data + 32 : NULL;
+}
+
+static int nt_insert(indexObject *self, const char *node, int rev);
+
+static int node_check(PyObject *obj, char **node, Py_ssize_t *nodelen)
+{
+	if (PyString_AsStringAndSize(obj, node, nodelen) == -1)
+		return -1;
+	if (*nodelen == 20)
+		return 0;
+	PyErr_SetString(PyExc_ValueError, "20-byte hash required");
+	return -1;
+}
+
 static PyObject *index_insert(indexObject *self, PyObject *args)
 {
-	PyObject *obj, *node;
+	PyObject *obj;
+	char *node;
 	long offset;
-	Py_ssize_t len;
+	Py_ssize_t len, nodelen;
 
 	if (!PyArg_ParseTuple(args, "lO", &offset, &obj))
 		return NULL;
 
 	if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) {
-		PyErr_SetString(PyExc_ValueError, "8-tuple required");
+		PyErr_SetString(PyExc_TypeError, "8-tuple required");
 		return NULL;
 	}
 
-	node = PyTuple_GET_ITEM(obj, 7);
-	if (!PyString_Check(node) || PyString_GET_SIZE(node) != 20) {
-		PyErr_SetString(PyExc_ValueError,
-				"20-byte hash required as last element");
+	if (node_check(PyTuple_GET_ITEM(obj, 7), &node, &nodelen) == -1)
 		return NULL;
-	}
 
 	len = index_length(self);
 
@@ -400,6 +474,12 @@
 		return NULL;
 	}
 
+	if (offset > INT_MAX) {
+		PyErr_SetString(PyExc_ValueError,
+				"currently only 2**31 revs supported");
+		return NULL;
+	}
+
 	if (self->added == NULL) {
 		self->added = PyList_New(0);
 		if (self->added == NULL)
@@ -409,6 +489,9 @@
 	if (PyList_Append(self->added, obj) == -1)
 		return NULL;
 
+	if (self->nt)
+		nt_insert(self, node, (int) offset);
+
 	Py_RETURN_NONE;
 }
 
@@ -428,26 +511,355 @@
 		free(self->offsets);
 		self->offsets = NULL;
 	}
+	if (self->nt) {
+		free(self->nt);
+		self->nt = NULL;
+	}
 }
 
 static PyObject *index_clearcaches(indexObject *self)
 {
 	_index_clearcaches(self);
+	self->ntlength = self->ntcapacity = 0;
+	self->ntdepth = self->ntsplits = 0;
+	self->ntrev = -1;
+	self->ntuses = 0;
 	Py_RETURN_NONE;
 }
 
-static int index_assign_subscript(indexObject* self, PyObject* item,
-				  PyObject* value)
+static PyObject *index_stats(indexObject *self)
+{
+	PyObject *obj = PyDict_New();
+
+	if (obj == NULL)
+		return NULL;
+
+#define istat(__n, __d) \
+	if (PyDict_SetItemString(obj, __d, PyInt_FromLong(self->__n)) == -1) \
+		goto bail;
+
+	if (self->added) {
+		Py_ssize_t len = PyList_GET_SIZE(self->added);
+		if (PyDict_SetItemString(obj, "index entries added",
+					 PyInt_FromLong(len)) == -1)
+			goto bail;
+	}
+
+	if (self->raw_length != self->length - 1)
+		istat(raw_length, "revs on disk");
+	istat(length, "revs in memory");
+	istat(ntlength, "node trie count");
+	istat(ntcapacity, "node trie capacity");
+	istat(ntdepth, "node trie depth");
+	istat(ntsplits, "node trie splits");
+	istat(ntrev, "node trie last rev scanned");
+
+#undef istat
+
+	return obj;
+
+bail:
+	Py_XDECREF(obj);
+	return NULL;
+}
+
+static inline int nt_level(const char *node, int level)
+{
+	int v = node[level>>1];
+	if (!(level & 1))
+		v >>= 4;
+	return v & 0xf;
+}
+
+static int nt_find(indexObject *self, const char *node, Py_ssize_t nodelen)
+{
+	int level, off;
+
+	if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0)
+		return -1;
+
+	if (self->nt == NULL)
+		return -2;
+
+	for (level = off = 0; level < nodelen; level++) {
+		int k = nt_level(node, level);
+		nodetree *n = &self->nt[off];
+		int v = n->children[k];
+
+		if (v < 0) {
+			const char *n;
+			v = -v - 1;
+			n = index_node(self, v);
+			if (n == NULL)
+				return -2;
+			return memcmp(node, n, nodelen > 20 ? 20 : nodelen)
+				? -2 : v;
+		}
+		if (v == 0)
+			return -2;
+		off = v;
+	}
+	return -2;
+}
+
+static int nt_new(indexObject *self)
+{
+	if (self->ntlength == self->ntcapacity) {
+		self->ntcapacity *= 2;
+		self->nt = realloc(self->nt,
+				   self->ntcapacity * sizeof(nodetree));
+		if (self->nt == NULL) {
+			PyErr_SetString(PyExc_MemoryError, "out of memory");
+			return -1;
+		}
+		memset(&self->nt[self->ntlength], 0,
+		       sizeof(nodetree) * (self->ntcapacity - self->ntlength));
+	}
+	return self->ntlength++;
+}
+
+static int nt_insert(indexObject *self, const char *node, int rev)
+{
+	int level = 0;
+	int off = 0;
+
+	while (level < 20) {
+		int k = nt_level(node, level);
+		nodetree *n;
+		int v;
+
+		n = &self->nt[off];
+		v = n->children[k];
+
+		if (v == 0) {
+			n->children[k] = -rev - 1;
+			return 0;
+		}
+		if (v < 0) {
+			const char *oldnode = index_node(self, -v - 1);
+			int noff;
+
+			if (oldnode == NULL) {
+				/* node has been deleted from array */
+				n->children[k] = -rev - 1;
+				return 0;
+			}
+			if (memcmp(oldnode, node, 20) == 0)
+				return 0;
+			noff = nt_new(self);
+			if (noff == -1)
+				return -1;
+			/* self->nt may have been changed by realloc */
+			self->nt[off].children[k] = noff;
+			off = noff;
+			n = &self->nt[off];
+			n->children[nt_level(oldnode, ++level)] = v;
+			if (level > self->ntdepth)
+				self->ntdepth = level;
+			self->ntsplits += 1;
+		} else {
+			level += 1;
+			off = v;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Return values:
+ *
+ *   -3: error (exception set)
+ *   -2: not found (no exception set)
+ * rest: valid rev
+ */
+static int index_find_node(indexObject *self,
+			   const char *node, Py_ssize_t nodelen)
+{
+	int rev;
+
+	rev = nt_find(self, node, nodelen);
+	if (rev >= -1)
+		return rev;
+
+	if (self->nt == NULL) {
+		self->ntcapacity = 4096;
+		self->nt = calloc(self->ntcapacity, sizeof(nodetree));
+		if (self->nt == NULL) {
+			PyErr_SetString(PyExc_MemoryError, "out of memory");
+			return -3;
+		}
+		self->ntlength = 1;
+		self->ntrev = (int) index_length(self) - 1;
+		self->ntuses = 0;
+	}
+
+	/*
+	 * For the first handful of lookups, we scan the entire index,
+	 * and cache only the matching nodes. This optimizes for cases
+	 * like "hg tip", where only a few nodes are accessed.
+	 *
+	 * After that, we cache every node we visit, using a single
+	 * scan amortized over multiple lookups.  This gives the best
+	 * bulk performance, e.g. for "hg log".
+	 */
+	if (self->ntuses < 8) {
+		self->ntuses++;
+		for (rev = self->ntrev - 1; rev >= 0; rev--) {
+			const char *n = index_node(self, rev);
+			if (n == NULL)
+				return -2;
+			if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
+				if (nt_insert(self, n, rev) == -1)
+					return -3;
+				break;
+			}
+		}
+	} else {
+		for (rev = self->ntrev - 1; rev >= 0; rev--) {
+			const char *n = index_node(self, rev);
+			if (n == NULL)
+				return -2;
+			if (nt_insert(self, n, rev) == -1)
+				return -3;
+			if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
+				break;
+			}
+		}
+		self->ntrev = rev;
+	}
+
+	if (rev >= 0)
+		return rev;
+	return -2;
+}
+
+static PyObject *raise_revlog_error(void)
+{
+	static PyObject *errclass;
+	PyObject *mod = NULL, *errobj;
+
+	if (errclass == NULL) {
+		PyObject *dict;
+
+		mod = PyImport_ImportModule("mercurial.error");
+		if (mod == NULL)
+			goto classfail;
+
+		dict = PyModule_GetDict(mod);
+		if (dict == NULL)
+			goto classfail;
+
+		errclass = PyDict_GetItemString(dict, "RevlogError");
+		if (errclass == NULL) {
+			PyErr_SetString(PyExc_SystemError,
+					"could not find RevlogError");
+			goto classfail;
+		}
+		Py_INCREF(errclass);
+	}
+
+	errobj = PyObject_CallFunction(errclass, NULL);
+	if (errobj == NULL)
+		return NULL;
+	PyErr_SetObject(errclass, errobj);
+	return errobj;
+
+classfail:
+	Py_XDECREF(mod);
+	return NULL;
+}
+
+static PyObject *index_getitem(indexObject *self, PyObject *value)
+{
+	char *node;
+	Py_ssize_t nodelen;
+	int rev;
+
+	if (PyInt_Check(value))
+		return index_get(self, PyInt_AS_LONG(value));
+
+	if (PyString_AsStringAndSize(value, &node, &nodelen) == -1)
+		return NULL;
+	rev = index_find_node(self, node, nodelen);
+	if (rev >= -1)
+		return PyInt_FromLong(rev);
+	if (rev == -2)
+		raise_revlog_error();
+	return NULL;
+}
+
+static PyObject *index_m_get(indexObject *self, PyObject *args)
+{
+	char *node;
+	int nodelen, rev;
+
+	if (!PyArg_ParseTuple(args, "s#", &node, &nodelen))
+		return NULL;
+
+	rev = index_find_node(self, node, nodelen);
+	if (rev ==  -3)
+		return NULL;
+	if (rev == -2)
+		Py_RETURN_NONE;
+	return PyInt_FromLong(rev);
+}
+
+static int index_contains(indexObject *self, PyObject *value)
+{
+	char *node;
+	Py_ssize_t nodelen;
+
+	if (PyInt_Check(value)) {
+		long rev = PyInt_AS_LONG(value);
+		return rev >= -1 && rev < index_length(self);
+	}
+
+	if (!PyString_Check(value))
+		return 0;
+
+	node = PyString_AS_STRING(value);
+	nodelen = PyString_GET_SIZE(value);
+
+	switch (index_find_node(self, node, nodelen)) {
+	case -3:
+		return -1;
+	case -2:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+/*
+ * Invalidate any trie entries introduced by added revs.
+ */
+static void nt_invalidate_added(indexObject *self, Py_ssize_t start)
+{
+	Py_ssize_t i, len = PyList_GET_SIZE(self->added);
+
+	for (i = start; i < len; i++) {
+		PyObject *tuple = PyList_GET_ITEM(self->added, i);
+		PyObject *node = PyTuple_GET_ITEM(tuple, 7);
+
+		nt_insert(self, PyString_AS_STRING(node), -1);
+	}
+
+	if (start == 0) {
+		Py_DECREF(self->added);
+		self->added = NULL;
+	}
+}
+
+/*
+ * Delete a numeric range of revs, which must be at the end of the
+ * range, but exclude the sentinel nullid entry.
+ */
+static int index_slice_del(indexObject* self, PyObject* item)
 {
 	Py_ssize_t start, stop, step, slicelength;
 	Py_ssize_t length = index_length(self);
 
-	if (!PySlice_Check(item) || value != NULL) {
-		PyErr_SetString(PyExc_TypeError,
-				"revlog index only supports slice deletion");
-		return -1;
-	}
-
 	if (PySlice_GetIndicesEx((PySliceObject*)item, length,
 				 &start, &stop, &step, &slicelength) < 0)
 		return -1;
@@ -476,20 +888,67 @@
 		return -1;
 	}
 
-	if (start < self->length) {
+	if (start < self->length - 1) {
 		self->length = start + 1;
-		if (self->added) {
-			Py_DECREF(self->added);
-			self->added = NULL;
+
+		if (self->nt) {
+			Py_ssize_t i;
+
+			for (i = start + 1; i < self->length - 1; i++) {
+				const char *node = index_node(self, i);
+
+				if (node)
+					nt_insert(self, node, -1);
+			}
+			if (self->added)
+				nt_invalidate_added(self, 0);
 		}
 		return 0;
 	}
 
-	return PyList_SetSlice(self->added, start - self->length + 1,
-			       PyList_GET_SIZE(self->added),
-			       NULL);
+	if (self->nt)
+		nt_invalidate_added(self, start - self->length + 1);
+	return self->added
+		? PyList_SetSlice(self->added, start - self->length + 1,
+				  PyList_GET_SIZE(self->added), NULL)
+		: 0;
 }
 
+/*
+ * Supported ops:
+ *
+ * slice deletion
+ * string assignment (extend node->rev mapping)
+ * string deletion (shrink node->rev mapping)
+ */
+static int index_assign_subscript(indexObject* self, PyObject* item,
+				  PyObject* value)
+{
+	char *node;
+	Py_ssize_t nodelen;
+	long rev;
+
+	if (PySlice_Check(item) && value == NULL)
+		return index_slice_del(self, item);
+
+	if (node_check(item, &node, &nodelen) == -1)
+		return -1;
+
+	if (value == NULL)
+		return self->nt ? nt_insert(self, node, -1) : 0;
+	rev = PyInt_AsLong(value);
+	if (rev > INT_MAX || rev < 0) {
+		if (!PyErr_Occurred())
+			PyErr_SetString(PyExc_ValueError, "rev out of range");
+		return -1;
+	}
+	return nt_insert(self, node, (int) rev);
+}
+
+/*
+ * Find all RevlogNG entries in an index that has inline data. Update
+ * the optional "offsets" table with those entries.
+ */
 static long inline_scan(indexObject *self, const char **offsets)
 {
 	const char *data = PyString_AS_STRING(self->data);
@@ -533,6 +992,10 @@
 
 	self->added = NULL;
 	self->offsets = NULL;
+	self->nt = NULL;
+	self->ntlength = self->ntcapacity = 0;
+	self->ntdepth = self->ntsplits = 0;
+	self->ntrev = -1;
 	Py_INCREF(self->data);
 
 	if (self->inlined) {
@@ -568,6 +1031,12 @@
 			       PyTuple_GET_ITEM(args, 0));
 }
 
+static PyObject *index_nodemap(indexObject *self)
+{
+	Py_INCREF(self);
+	return (PyObject *) self;
+}
+
 static void index_dealloc(indexObject *self)
 {
 	_index_clearcaches(self);
@@ -581,19 +1050,32 @@
 	0,			 /* sq_concat */
 	0,			 /* sq_repeat */
 	(ssizeargfunc)index_get, /* sq_item */
+	0,			 /* sq_slice */
+	0,			 /* sq_ass_item */
+	0,			 /* sq_ass_slice */
+	(objobjproc)index_contains, /* sq_contains */
 };
 
 static PyMappingMethods index_mapping_methods = {
     (lenfunc)index_length,	/* mp_length */
-    NULL,			/* mp_subscript */
+    (binaryfunc)index_getitem,	/* mp_subscript */
     (objobjargproc)index_assign_subscript, /* mp_ass_subscript */
 };
 
 static PyMethodDef index_methods[] = {
 	{"clearcaches", (PyCFunction)index_clearcaches, METH_NOARGS,
 	 "clear the index caches"},
+	{"get", (PyCFunction)index_m_get, METH_VARARGS,
+	 "get an index entry"},
 	{"insert", (PyCFunction)index_insert, METH_VARARGS,
 	 "insert an index entry"},
+	{"stats", (PyCFunction)index_stats, METH_NOARGS,
+	 "stats for the index"},
+	{NULL}	/* Sentinel */
+};
+
+static PyGetSetDef index_getset[] = {
+	{"nodemap", (getter)index_nodemap, NULL, "nodemap", NULL},
 	{NULL}	/* Sentinel */
 };
 
@@ -628,7 +1110,7 @@
 	0,		 /* tp_iternext */
 	index_methods,		   /* tp_methods */
 	0,			   /* tp_members	*/
-	0,			   /* tp_getset		*/
+	index_getset,		   /* tp_getset		*/
 	0,			   /* tp_base		*/
 	0,			   /* tp_dict		*/
 	0,			   /* tp_descr_get	*/
@@ -640,10 +1122,10 @@
 };
 
 /*
- * returns a tuple of the form (index, None, cache) with elements as
+ * returns a tuple of the form (index, index, cache) with elements as
  * follows:
  *
- * index: an index object that lazily parses the RevlogNG records
+ * index: an index object that lazily parses RevlogNG records
  * cache: if data is inlined, a tuple (index_file_content, 0), else None
  *
  * added complications are for backwards compatibility
@@ -678,6 +1160,8 @@
 		Py_INCREF(cache);
 	}
 
+	Py_INCREF(idx);
+
 	tuple = Py_BuildValue("NN", idx, cache);
 	if (!tuple)
 		goto bail;
@@ -701,8 +1185,6 @@
 
 static void module_init(PyObject *mod)
 {
-	static const char nullid[20];
-
 	if (PyType_Ready(&indexType) < 0)
 		return;
 	Py_INCREF(&indexType);
@@ -737,4 +1219,3 @@
 	module_init(mod);
 }
 #endif
-
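
(Editorial aside, not part of the changeset: the comment in
index_find_node() above describes a two-phase caching policy. A
simplified pure-Python model of that policy, with hypothetical names
and the trie replaced by a dict, might look like this.)

class nodemapmodel(object):
    # Toy model of index_find_node()'s caching policy, not the real code.
    def __init__(self, nodes):
        self.nodes = nodes      # nodes[rev] -> 20-byte binary hash
        self.cache = {}         # stands in for the base-16 trie
        self.uses = 0           # number of "early" lookups performed
        self.ntrev = len(nodes) # revs at or above this are not yet scanned

    def rev(self, node):
        if node in self.cache:
            return self.cache[node]
        if self.uses < 8:
            # First few lookups ("hg tip" style): scan everything,
            # but cache only the node we were asked about.
            self.uses += 1
            for r in xrange(self.ntrev - 1, -1, -1):
                if self.nodes[r] == node:
                    self.cache[node] = r
                    return r
        else:
            # Bulk lookups ("hg log" style): cache every node visited and
            # remember how far we got, amortizing one full scan.
            for r in xrange(self.ntrev - 1, -1, -1):
                self.cache[self.nodes[r]] = r
                if self.nodes[r] == node:
                    self.ntrev = r
                    return r
            self.ntrev = 0
        raise KeyError(node)
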
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -174,7 +174,7 @@
     def parseindex(self, data, inline):
         # call the C implementation to parse the index data
         index, cache = parsers.parse_index2(data, inline)
-        return index, None, cache
+        return index, getattr(index, 'nodemap', None), cache
 
     def packentry(self, entry, node, version, rev):
         p = _pack(indexformatng, *entry)
@@ -288,10 +288,21 @@
         self.rev(self.node(0))
         return self._nodecache
 
+    def clearcaches(self):
+        try:
+            self._nodecache.clearcaches()
+        except AttributeError:
+            self._nodecache = {nullid: nullrev}
+            self._nodepos = None
+
     def rev(self, node):
         try:
             return self._nodecache[node]
+        except RevlogError:
+            # parsers.c radix tree lookup failed
+            raise LookupError(node, self.indexfile, _('no node'))
         except KeyError:
+            # pure python cache lookup failed
             n = self._nodecache
             i = self.index
             p = self._nodepos
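
(Editorial aside, not part of the changeset: with the hunk above,
parseindex now hands the C index object back as its own nodemap. A
hedged usage sketch follows, where data, inline and node are assumed
inputs: the raw contents of a RevlogNG index file such as 00changelog.i,
its inline flag, and a known 20-byte binary hash.)

from mercurial import parsers

index, cache = parsers.parse_index2(data, inline)

nodemap = index.nodemap     # the index object itself, exposed via tp_getset
rev = nodemap[node]         # node -> rev; unknown nodes raise RevlogError
maybe = index.get(node)     # node -> rev, or None when the node is unknown
print index.stats()         # trie and index statistics, for debugging
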
diff --git a/tests/test-parseindex2.py b/tests/test-parseindex2.py
--- a/tests/test-parseindex2.py
+++ b/tests/test-parseindex2.py
@@ -110,6 +110,17 @@
     if py_res_2 != c_res_2:
         print "Parse index result (no inlined data) differs!"
 
+    ix = parsers.parse_index2(data_inlined, True)[0]
+    for i,r in enumerate(ix):
+        try:
+            if r[7] == nullid:
+                i = -1
+            if ix[r[7]] != i:
+                print 'Reverse lookup inconsistent for %r' % r[7].encode('hex')
+        except:
+            print 'Reverse lookup failed for %r' % r[7].encode('hex')
+            raise
+
     print "done"
 
 runtest()

