[PATCH] revlog: speed up prefix matching against nodes

Bryan O'Sullivan bos at serpentine.com
Wed May 2 18:37:59 CDT 2012


# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1336001830 25200
# Branch stable
# Node ID 365a246d027bb04223f52cdeab9772aafab5d83e
# Parent  c82db2287f3b11d191dd46240fdb9439f9a72448
revlog: speed up prefix matching against nodes

The radix tree already contains all the information we need to
determine whether a short string is an unambiguous node identifier.
We now make use of this information.

In a kernel tree, this improves the performance of
"hg log -q -r24bf01de75" from 0.27 seconds to 0.06.

diff -r c82db2287f3b -r 365a246d027b mercurial/parsers.c
--- a/mercurial/parsers.c	Wed May 02 16:27:04 2012 -0700
+++ b/mercurial/parsers.c	Wed May 02 16:37:10 2012 -0700
@@ -544,11 +544,19 @@
 	return v & 0xf;
 }
 
+/*
+ * Return values:
+ *
+ *   -4: multiple matches
+ *   -2: not found
+ * rest: valid rev
+ */
 static int nt_find(indexObject *self, const char *node, Py_ssize_t nodelen)
 {
 	int level, off;
 
-	if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0)
+	if (nodelen >= 2 && node[0] == '\0' &&
+	    memcmp(node, nullid, nodelen < 20 ? nodelen : 20) == 0)
 		return -1;
 
 	if (self->nt == NULL)
@@ -572,7 +580,8 @@
 			return -2;
 		off = v;
 	}
-	return -2;
+	/* multiple matches against an ambiguous prefix */
+	return -4;
 }
 
 static int nt_new(indexObject *self)
@@ -636,6 +645,24 @@
 	return -1;
 }
 
+static int nt_init(indexObject *self)
+{
+	if (self->nt == NULL) {
+		self->ntcapacity = self->raw_length < 4
+			? 4 : self->raw_length / 2;
+		self->nt = calloc(self->ntcapacity, sizeof(nodetree));
+		if (self->nt == NULL) {
+			PyErr_NoMemory();
+			return -1;
+		}
+		self->ntlength = 1;
+		self->ntrev = (int)index_length(self) - 1;
+		self->ntlookups = 1;
+		self->ntmisses = 0;
+	}
+	return 0;
+}
+
 /*
  * Return values:
  *
@@ -653,19 +680,8 @@
 	if (rev >= -1)
 		return rev;
 
-	if (self->nt == NULL) {
-		self->ntcapacity = self->raw_length < 4
-			? 4 : self->raw_length / 2;
-		self->nt = calloc(self->ntcapacity, sizeof(nodetree));
-		if (self->nt == NULL) {
-			PyErr_SetString(PyExc_MemoryError, "out of memory");
-			return -3;
-		}
-		self->ntlength = 1;
-		self->ntrev = (int)index_length(self) - 1;
-		self->ntlookups = 1;
-		self->ntmisses = 0;
-	}
+	if (nt_init(self) == -1)
+		return -3;
 
 	/*
 	 * For the first handful of lookups, we scan the entire index,
@@ -761,6 +777,109 @@
 	return NULL;
 }
 
+static int allhex(const char *str, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		char c = str[i];
+		if ((c < '0' || c > '9') && (c < 'a' || c > 'f') &&
+		    (c < 'A' || c > 'F'))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int nt_partialmatch(indexObject *self, const char *node,
+			   Py_ssize_t nodelen)
+{
+	int rev;
+
+	if (nt_init(self) == -1)
+		return -3;
+
+	if (self->ntrev > 0) {
+		for (rev = self->ntrev - 1; rev >= 0; rev--) {
+			const char *n = index_node(self, rev);
+			if (n == NULL)
+				return -2;
+			if (nt_insert(self, n, rev) == -1)
+				return -3;
+		}
+		self->ntrev = rev;
+	}
+
+	return nt_find(self, node, nodelen);
+}
+
+static PyObject *index_partialmatch(indexObject *self, PyObject *args)
+{
+	PyObject *bin, *ret = NULL;
+	Py_ssize_t nodelen;
+	char *orig, *node;
+	int origlen;
+	int rev;
+
+	if (!PyArg_ParseTuple(args, "s#", &orig, &origlen))
+		return NULL;
+
+	if (origlen < 2) {
+		PyErr_SetString(PyExc_ValueError, "key too short");
+		return NULL;
+	}
+
+	if (allhex(orig, origlen)) {
+		bin = unhexlify(orig, origlen & ~1);
+		if (bin == NULL)
+			goto done;
+		node = PyString_AS_STRING(bin);
+		nodelen = PyString_GET_SIZE(bin);
+	} else {
+		bin = NULL;
+		node = orig;
+		nodelen = origlen;
+	}
+
+	rev = nt_partialmatch(self, node, nodelen);
+	if (rev == -3)
+		goto done;
+	if (rev == -4) {
+		raise_revlog_error();
+		goto done;
+	}
+	if (rev == -2 && bin != NULL) {
+		/* Tiny chance that the hex string we decoded was actually
+		 * binary that happened to be all hex digits. */
+		rev = nt_find(self, orig, origlen);
+		if (rev == -4) {
+			raise_revlog_error();
+			goto done;
+		}
+	}
+	if (rev == -1) {
+		ret = PyString_FromStringAndSize(nullid, 20);
+		goto done;
+	}
+	if (rev >= 0) {
+		const char *node = index_node(self, rev);
+		if (node == NULL) {
+			PyErr_Format(PyExc_IndexError,
+				     "could not access rev %d", rev);
+			goto done;
+		}
+		ret = PyString_FromStringAndSize(node, 20);
+		goto done;
+	}
+
+	ret = Py_None;
+	Py_INCREF(Py_None);
+
+done:
+	Py_XDECREF(bin);
+	return ret;
+}
+
 static PyObject *index_m_get(indexObject *self, PyObject *args)
 {
 	char *node;
@@ -1410,6 +1529,8 @@
 	 "get an index entry"},
 	{"insert", (PyCFunction)index_insert, METH_VARARGS,
 	 "insert an index entry"},
+	{"partialmatch", (PyCFunction)index_partialmatch, METH_VARARGS,
+	 "match a potentially ambiguous node ID"},
 	{"stats", (PyCFunction)index_stats, METH_NOARGS,
 	 "stats for the index"},
 	{NULL} /* Sentinel */
diff -r c82db2287f3b -r 365a246d027b mercurial/revlog.py
--- a/mercurial/revlog.py	Wed May 02 16:27:04 2012 -0700
+++ b/mercurial/revlog.py	Wed May 02 16:37:10 2012 -0700
@@ -753,6 +753,15 @@
                 pass
 
     def _partialmatch(self, id):
+        try:
+            return self.index.partialmatch(id)
+        except RevlogError:
+            # parsers.c radix tree lookup gave multiple matches
+            raise LookupError(id, self.indexfile, _("ambiguous identifier"))
+        except (AttributeError, ValueError):
+            # we are pure python, or key was too short to search radix tree
+            pass
+
         if id in self._pcache:
             return self._pcache[id]
 


More information about the Mercurial-devel mailing list