[PATCH 2 of 4] revlog: add a context manager that caches the reading file handle

Gregory Szorc gregory.szorc at gmail.com
Tue Jul 14 15:51:11 CDT 2015


# HG changeset patch
# User Gregory Szorc <gregory.szorc at gmail.com>
# Date 1436900774 25200
#      Tue Jul 14 12:06:14 2015 -0700
# Node ID bcdf6cd26113b06a489b9a3051bcd20a1e44fc4b
# Parent  5154b8421ebd670d6155659c5811f193f5fe1555
revlog: add a context manager that caches the reading file handle

Currently, every time revlog._loadchunk() is called, a new file handle is
opened, seeked, read, and closed. For operations that read several
revisions from the revlog (including `hg log` and changegroup
application), this can incur thousands of opens and closes.

This patch introduces a context manager to the revlog class that enables
the read file handle to be cached across operations that may perform
several revlog reads, thus saving the overhead of the extra open() and
close() operations.

The ability to use a persistent file descriptor was the default behavior
of Mercurial until ef393d6ec030 (May 2009). The reason for the change
wasn't clearly documented in the changeset. It's possible that some
operating systems may not behave well. This will need to be tested.

This patch stops short of actually using this new context manager
anywhere.

diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -13,8 +13,9 @@ and O(changes) merge between branches.
 
 from __future__ import absolute_import
 
 import collections
+import contextlib
 import errno
 import struct
 import zlib
 
@@ -232,8 +233,11 @@ class revlog(object):
         self.index = []
         self._pcache = {}
         self._nodecache = {nullid: nullrev}
         self._nodepos = None
+        # Cached file read-only file descriptors.
+        self._indexfh = None
+        self._datafh = None
 
         v = REVLOG_DEFAULT_VERSION
         opts = getattr(opener, 'options', None)
         if opts is not None:
@@ -948,12 +952,22 @@ class revlog(object):
         else:
             self._chunkcache = offset, data
 
     def _loadchunk(self, offset, length):
+        opened = False
+
         if self._inline:
-            df = self.opener(self.indexfile)
+            if self._indexfh:
+                df = self._indexfh
+            else:
+                df = self.opener(self.indexfile)
+                opened = True
         else:
-            df = self.opener(self.datafile)
+            if self._datafh:
+                df = self._datafh
+            else:
+                df = self.opener(self.datafile)
+                opened = True
 
         # Cache data both forward and backward around the requested
         # data, in a fixed size window. This helps speed up operations
         # involving reading the revlog backwards.
@@ -962,9 +976,10 @@ class revlog(object):
         reallength = (((offset + length + cachesize) & ~(cachesize - 1))
                       - realoffset)
         df.seek(realoffset)
         d = df.read(reallength)
-        df.close()
+        if opened:
+            df.close()
         self._addchunk(realoffset, d)
         if offset != realoffset or reallength != length:
             return util.buffer(d, offset - realoffset, length)
         return d
@@ -1053,8 +1068,56 @@ class revlog(object):
 
         return mdiff.textdiff(self.revision(rev1),
                               self.revision(rev2))
 
+    @contextlib.contextmanager
+    def persistentreadhandle(self):
+        """Context manager that activates a persistent file handle for reading.
+
+        When uncached revlog file data is needed, the revlog data or index file
+        is opened, read, then closed. This prevents file descriptors from
+        accumulating, possibly exhausting the open file descriptor limit.
+        However, this also adds performance overhead.
+
+        When entered, this context manager will enable the read file handle to
+        be cached across multiple read operations. Consumers should take care
+        to not use this context manager on too many revlogs at once, or it may
+        exhaust the open file descriptor limit.
+
+        The context manager value is a function that should be called when the
+        revlog switches from inline to conventional. This will ensure the data
+        file (not just the index) is cached.
+
+        The context manager can be nested. If so, subsequent/nested
+        enters/exits will no-op.
+        """
+
+        # We keep track of whether we opened the file handles so
+        # subsequent/nested enters/exits can no-op.
+        opened = [False, False]
+        def refresh():
+            if self._inline:
+                if not self._indexfh:
+                    self._indexfh = self.opener(self.indexfile)
+                    opened[0] = True
+            else:
+                if not self._datafh:
+                    self._datafh = self.opener(self.datafile)
+                    opened[1] = True
+
+        refresh()
+
+        try:
+            yield refresh
+        finally:
+            if self._indexfh and opened[0]:
+                self._indexfh.close()
+                self._indexfh = None
+
+            if self._datafh and opened[1]:
+                self._datafh.close()
+                self._datafh = None
+
     def revision(self, nodeorrev):
         """return an uncompressed revision of a given node or revision
         number.
         """


More information about the Mercurial-devel mailing list