[PATCH 1 of 2 RFC] util: reimplement lrucachedict

Gregory Szorc gregory.szorc at gmail.com
Mon Nov 23 06:28:50 UTC 2015


# HG changeset patch
# User Gregory Szorc <gregory.szorc at gmail.com>
# Date 1448252535 28800
#      Sun Nov 22 20:22:15 2015 -0800
# Node ID 5d0aff0739984ebdcc5c99a84d338749cd702119
# Parent  160d7e207cc161543c0486c4de223b30621cd6d9
util: reimplement lrucachedict

Previous experience has taught me that collections.deque has pretty
bad performance characteristics. As part of attempting to more
aggressively use the existing lrucachedict, collections.deque
operations were frequently showing up in profiling output, negating
benefits of caching.

Searching the internet seems to tell me that the most efficient
way to implement an LRU cache in Python is to have a dict indexing
the cached entries and then to use a doubly linked list to track
freshness of each entry. So, this patch replaces our existing
lrucachedict with a version based on a doubly linked list.

diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -437,44 +437,128 @@ class sortdict(dict):
         return self._list.__iter__()
     def iteritems(self):
         for k in self._list:
             yield k, self[k]
     def insert(self, index, key, val):
         self._list.insert(index, key)
         dict.__setitem__(self, key, val)
 
+class _lrucachenode(object):
+    """A node in a doubly linked list.
+
+    Holds a reference to nodes on either side as well as a key-value
+    pair for the dictionary entry.
+    """
+    def __init__(self):
+        self.next = None
+        self.prev = None
+
+        self.key = _notset
+        self.value = None
+
+    def __nonzero__(self):
+        return self.key is not _notset
+
+    def empty(self):
+        """Mark the node as emptied."""
+        self.key = _notset
+
 class lrucachedict(object):
-    '''cache most recent gets from or sets to this dictionary'''
-    def __init__(self, maxsize):
+    """Dict that caches most recent accesses and sets.
+
+    The dict consists of an actual backing dict, indexed by original
+    key and a doubly linked list defining the order of entries in the
+    cache.
+    """
+    def __init__(self, maxsize):
         self._cache = {}
-        self._maxsize = maxsize
-        self._order = collections.deque()
 
-    def __getitem__(self, key):
-        value = self._cache[key]
-        self._order.remove(key)
-        self._order.append(key)
-        return value
+        self._head = _lrucachenode()
 
-    def __setitem__(self, key, value):
-        if key not in self._cache:
-            if len(self._cache) >= self._maxsize:
-                del self._cache[self._order.popleft()]
-        else:
-            self._order.remove(key)
-        self._cache[key] = value
-        self._order.append(key)
+        last = self._head
+        for i in range(maxsize - 1):
+            node = _lrucachenode()
+            last.next = node
+            node.prev = last
+            last = node
 
-    def __contains__(self, key):
-        return key in self._cache
+        last.next = self._head
+        self._head.prev = last
+
+    def __len__(self):
+        return len(self._cache)
+
+    def __contains__(self, k):
+        return k in self._cache
+
+    def __iter__(self):
+        # We don't have to iterate in cache order, but why not.
+        n = self._head
+        for i in range(len(self._cache)):
+            yield n.key
+            n = n.next
+
+    def __getitem__(self, k):
+        node = self._cache[k]
+
+        self._recordaccess(node)
+        self._head = node
+        return node.value
+
+    def __setitem__(self, k, v):
+        node = self._cache.get(k)
+        if isinstance(node, _lrucachenode):
+            node.value = v
+            self._recordaccess(node)
+            self._head = node
+            return
+
+        # Grab the last item in the linked list.
+        node = self._head.prev
+
+        # At capacity. Kill the old entry.
+        if node:
+            del self._cache[node.key]
+
+        node.key = k
+        node.value = v
+        self._cache[k] = node
+        self._head = node
+
+    def __delitem__(self, k):
+        node = self._cache.pop(k)
+        node.empty()
+
+        self._recordaccess(node)
+        self._head = node.next
+
+    # Additional dict methods.
+
+    def get(self, k, default=None):
+        try:
+            return self._cache[k].value
+        except KeyError:
+            return default
 
     def clear(self):
+        n = self._head
+        while n:
+            n.empty()
+            n = n.next
+
         self._cache.clear()
-        self._order = collections.deque()
+
+    def _recordaccess(self, node):
+        node.prev.next = node.next
+        node.next.prev = node.prev
+        node.prev = self._head.prev
+        node.next = self._head.prev.next
+        node.next.prev = node
+        node.prev.next = node
 
 def lrucachefunc(func):
     '''cache most recent results of function calls'''
     cache = {}
     order = collections.deque()
     if func.func_code.co_argcount == 1:
         def f(arg):
             if arg not in cache:


More information about the Mercurial-devel mailing list