[PATCH] revlog: speed up prefix matching against nodes
Bryan O'Sullivan
bos at serpentine.com
Wed May 2 18:37:59 CDT 2012
# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1336001830 25200
# Branch stable
# Node ID 365a246d027bb04223f52cdeab9772aafab5d83e
# Parent c82db2287f3b11d191dd46240fdb9439f9a72448
revlog: speed up prefix matching against nodes
The radix tree already contains all the information we need to
determine whether a short string is an unambiguous node identifier.
We now make use of this information.
In a kernel tree, this improves the performance of
"hg log -q -r24bf01de75" from 0.27 seconds to 0.06.
diff -r c82db2287f3b -r 365a246d027b mercurial/parsers.c
--- a/mercurial/parsers.c Wed May 02 16:27:04 2012 -0700
+++ b/mercurial/parsers.c Wed May 02 16:37:10 2012 -0700
@@ -544,11 +544,19 @@
return v & 0xf;
}
+/*
+ * Return values:
+ *
+ * -4: multiple matches
+ * -2: not found
+ * rest: valid rev
+ */
static int nt_find(indexObject *self, const char *node, Py_ssize_t nodelen)
{
int level, off;
- if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0)
+ if (nodelen >= 2 && node[0] == '\0' &&
+ memcmp(node, nullid, nodelen < 20 ? nodelen : 20) == 0)
return -1;
if (self->nt == NULL)
@@ -572,7 +580,8 @@
return -2;
off = v;
}
- return -2;
+ /* multiple matches against an ambiguous prefix */
+ return -4;
}
static int nt_new(indexObject *self)
@@ -636,6 +645,24 @@
return -1;
}
+static int nt_init(indexObject *self)
+{
+ if (self->nt == NULL) {
+ self->ntcapacity = self->raw_length < 4
+ ? 4 : self->raw_length / 2;
+ self->nt = calloc(self->ntcapacity, sizeof(nodetree));
+ if (self->nt == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ self->ntlength = 1;
+ self->ntrev = (int)index_length(self) - 1;
+ self->ntlookups = 1;
+ self->ntmisses = 0;
+ }
+ return 0;
+}
+
/*
* Return values:
*
@@ -653,19 +680,8 @@
if (rev >= -1)
return rev;
- if (self->nt == NULL) {
- self->ntcapacity = self->raw_length < 4
- ? 4 : self->raw_length / 2;
- self->nt = calloc(self->ntcapacity, sizeof(nodetree));
- if (self->nt == NULL) {
- PyErr_SetString(PyExc_MemoryError, "out of memory");
- return -3;
- }
- self->ntlength = 1;
- self->ntrev = (int)index_length(self) - 1;
- self->ntlookups = 1;
- self->ntmisses = 0;
- }
+ if (nt_init(self) == -1)
+ return -3;
/*
* For the first handful of lookups, we scan the entire index,
@@ -761,6 +777,109 @@
return NULL;
}
+static int allhex(const char *str, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ char c = str[i];
+ if ((c < '0' || c > '9') && (c < 'a' || c > 'f') &&
+ (c < 'A' || c > 'F'))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int nt_partialmatch(indexObject *self, const char *node,
+ Py_ssize_t nodelen)
+{
+ int rev;
+
+ if (nt_init(self) == -1)
+ return -3;
+
+ if (self->ntrev > 0) {
+ for (rev = self->ntrev - 1; rev >= 0; rev--) {
+ const char *n = index_node(self, rev);
+ if (n == NULL)
+ return -2;
+ if (nt_insert(self, n, rev) == -1)
+ return -3;
+ }
+ self->ntrev = rev;
+ }
+
+ return nt_find(self, node, nodelen);
+}
+
+static PyObject *index_partialmatch(indexObject *self, PyObject *args)
+{
+ PyObject *bin, *ret = NULL;
+ Py_ssize_t nodelen;
+ char *orig, *node;
+ int origlen;
+ int rev;
+
+ if (!PyArg_ParseTuple(args, "s#", &orig, &origlen))
+ return NULL;
+
+ if (origlen < 2) {
+ PyErr_SetString(PyExc_ValueError, "key too short");
+ return NULL;
+ }
+
+ if (allhex(orig, origlen)) {
+ bin = unhexlify(orig, origlen & ~1);
+ if (bin == NULL)
+ goto done;
+ node = PyString_AS_STRING(bin);
+ nodelen = PyString_GET_SIZE(bin);
+ } else {
+ bin = NULL;
+ node = orig;
+ nodelen = origlen;
+ }
+
+ rev = nt_partialmatch(self, node, nodelen);
+ if (rev == -3)
+ goto done;
+ if (rev == -4) {
+ raise_revlog_error();
+ goto done;
+ }
+ if (rev == -2 && bin != NULL) {
+ /* Tiny chance that the hex string we decoded was actually
+ * binary that happened to be all hex digits. */
+ rev = nt_find(self, orig, origlen);
+ if (rev == -4) {
+ raise_revlog_error();
+ goto done;
+ }
+ }
+ if (rev == -1) {
+ ret = PyString_FromStringAndSize(nullid, 20);
+ goto done;
+ }
+ if (rev >= 0) {
+ const char *node = index_node(self, rev);
+ if (node == NULL) {
+ PyErr_Format(PyExc_IndexError,
+ "could not access rev %d", rev);
+ goto done;
+ }
+ ret = PyString_FromStringAndSize(node, 20);
+ goto done;
+ }
+
+ ret = Py_None;
+ Py_INCREF(Py_None);
+
+done:
+ Py_XDECREF(bin);
+ return ret;
+}
+
static PyObject *index_m_get(indexObject *self, PyObject *args)
{
char *node;
@@ -1410,6 +1529,8 @@
"get an index entry"},
{"insert", (PyCFunction)index_insert, METH_VARARGS,
"insert an index entry"},
+ {"partialmatch", (PyCFunction)index_partialmatch, METH_VARARGS,
+ "match a potentially ambiguous node ID"},
{"stats", (PyCFunction)index_stats, METH_NOARGS,
"stats for the index"},
{NULL} /* Sentinel */
diff -r c82db2287f3b -r 365a246d027b mercurial/revlog.py
--- a/mercurial/revlog.py Wed May 02 16:27:04 2012 -0700
+++ b/mercurial/revlog.py Wed May 02 16:37:10 2012 -0700
@@ -753,6 +753,15 @@
pass
def _partialmatch(self, id):
+ try:
+ return self.index.partialmatch(id)
+ except RevlogError:
+ # parsers.c radix tree lookup gave multiple matches
+ raise LookupError(id, self.indexfile, _("ambiguous identifier"))
+ except (AttributeError, ValueError):
+ # we are pure python, or key was too short to search radix tree
+ pass
+
if id in self._pcache:
return self._pcache[id]
More information about the Mercurial-devel
mailing list