[PATCH 6 of 6] dirs: reuse strings in _incdirs

Bryan O'Sullivan bos at serpentine.com
Thu Mar 28 20:22:48 CDT 2013


# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1364520157 25200
#      Thu Mar 28 18:22:37 2013 -0700
# Node ID b46e9a87a202d0c001ee6e25c0ce47c8c56fbd5e
# Parent  96da5c8a645016dc0b68855ff26293ceb1188cf2
dirs: reuse strings in _incdirs

In the common case where we are updating a refcount for an existing
directory, this reduces the number of memory allocations and deallocations
from O(n) to 1, where n is the number of '/' characters in a path.

Below are the cumulative effects of the three changes (the original
naive C code plus the two speedups), relative to the base Python
code.

perfdirs performance in a working dir with 170,000 files:

  Python      650  msec
  now         167  msec

Commands run in the same working dir show that this speedup is visible
in practice:

  hg add
    Python   1.09  sec
    now      0.60  sec

  hg addremove
    Python   4.79
    now      4.27

  hg rebase -d @^
    Python   17.2
    now      15.5

diff --git a/mercurial/dirs.c b/mercurial/dirs.c
--- a/mercurial/dirs.c
+++ b/mercurial/dirs.c
@@ -15,6 +15,9 @@
  * We violate the Python rule that integers are immutable. Said
  * integers are used only for internal refcounting by this code, and
  * are not (and must not be) used by Python code.
+ *
+ * We also violate the rule that strings are immutable, but this is an
+ * internal implementation detail that is not visible to Python code.
  */
 
 static inline Py_ssize_t _finddir(PyObject *path, Py_ssize_t pos)
@@ -32,17 +35,25 @@ static inline Py_ssize_t _finddir(PyObje
 
 static int _incdirs(PyObject *dirs, PyObject *path)
 {
+	const char *cpath = PyString_AS_STRING(path);
 	Py_ssize_t pos = PyString_GET_SIZE(path);
-	PyObject *newval = NULL, *key = NULL;
+	PyObject *newval = NULL, *key;
 	int ret = -1;
 
+	/* It's likely that every prefix already has an entry in the
+	   map. Try to avoid allocating and deallocating a string for
+	   each prefix we check. */
+	key = PyString_FromStringAndSize(cpath, pos);
+
+	if (key == NULL)
+		goto bail;
+
 	while ((pos = _finddir(path, pos - 1)) != -1) {
-		PyObject *val;
+		PyObject *val, *newkey;
 
-		key = PyString_FromStringAndSize(PyString_AS_STRING(path), pos);
-
-		if (key == NULL)
-			goto bail;
+		((PyStringObject *) key)->ob_shash = -1;
+		PyString_GET_SIZE(key) = pos;
+		PyString_AS_STRING(key)[pos] = '\0';
 
 		val = PyDict_GetItem(dirs, key);
 		/* Avoid allocating and deallocating an int every time
@@ -57,7 +68,6 @@ static int _incdirs(PyObject *dirs, PyOb
 				goto bail;
 			}
 			PyInt_AS_LONG(val) += 1;
-			Py_CLEAR(key);
 			continue;
 		}
 
@@ -72,7 +82,13 @@ static int _incdirs(PyObject *dirs, PyOb
 		ret = PyDict_SetItem(dirs, key, newval);
 		if (ret == -1)
 			goto bail;
-		Py_CLEAR(key);
+		newkey = PyString_FromStringAndSize(PyString_AS_STRING(key),
+						    pos - 1);
+		if (newkey == NULL)
+			goto bail;
+
+		Py_DECREF(key);
+		key = newkey;
 		Py_CLEAR(newval);
 	}
 	ret = 0;


More information about the Mercurial-devel mailing list