[PATCH 1 of 2] dirstate: track updated files to improve write time

Durham Goode durham at fb.com
Mon Mar 6 00:24:11 UTC 2017


# HG changeset patch
# User Durham Goode <durham at fb.com>
# Date 1488759607 28800
#      Sun Mar 05 16:20:07 2017 -0800
# Node ID 4a751a57ef0682dc3d7e46bdfa67a26b13cd9031
# Parent  9f62182a506396ee246ee5ec535c076caff98bde
dirstate: track updated files to improve write time

Previously, dirstate.write() would iterate over the entire dirstate to find any
entries that needed to be marked 'lookup' (i.e. if they have the same timestamp
as now). This was O(working copy) and slow in large repos. It was most visible
when rebasing or histediting multiple commits, since it gets executed once per
commit, even if the entire rebase/histedit is wrapped in a transaction.

The fix is to track which files have been editted, and only check those to see
if they need to be marked as 'lookup'. This saves 25% on histedit times in very
large repositories.

I tested this by adding temporary debug logic to verify that the old files
processed in the loop matched the new files processed in the loop and running
the test suite.

diff --git a/mercurial/dirstate.py b/mercurial/dirstate.py
--- a/mercurial/dirstate.py
+++ b/mercurial/dirstate.py
@@ -89,6 +89,7 @@ class dirstate(object):
         self._pendingfilename = '%s.pending' % self._filename
         self._plchangecallbacks = {}
         self._origpl = None
+        self._updatedfiles = set()
 
         # for consistent view between _pl() and _read() invocations
         self._pendingmode = None
@@ -431,6 +432,7 @@ class dirstate(object):
                 delattr(self, a)
         self._lastnormaltime = 0
         self._dirty = False
+        self._updatedfiles.clear()
         self._parentwriters = 0
         self._origpl = None
 
@@ -441,8 +443,11 @@ class dirstate(object):
         self._dirty = True
         if source is not None:
             self._copymap[dest] = source
+            self._updatedfiles.add(source)
+            self._updatedfiles.add(dest)
         elif dest in self._copymap:
             del self._copymap[dest]
+            self._updatedfiles.add(dest)
 
     def copied(self, file):
         return self._copymap.get(file, None)
@@ -459,6 +464,8 @@ class dirstate(object):
             if normed in self._filefoldmap:
                 del self._filefoldmap[normed]
 
+        self._updatedfiles.add(f)
+
     def _addpath(self, f, state, mode, size, mtime):
         oldstate = self[f]
         if state == 'a' or oldstate == 'r':
@@ -475,6 +482,7 @@ class dirstate(object):
         if oldstate in "?r" and "_dirs" in self.__dict__:
             self._dirs.addpath(f)
         self._dirty = True
+        self._updatedfiles.add(f)
         self._map[f] = dirstatetuple(state, mode, size, mtime)
         if state != 'n' or mtime == -1:
             self._nonnormalset.add(f)
@@ -656,6 +664,7 @@ class dirstate(object):
         self._copymap = {}
         self._pl = [nullid, nullid]
         self._lastnormaltime = 0
+        self._updatedfiles.clear()
         self._dirty = True
 
     def rebuild(self, parent, allfiles, changedfiles=None):
@@ -692,13 +701,15 @@ class dirstate(object):
             # emulate dropping timestamp in 'parsers.pack_dirstate'
             now = _getfsnow(self._opener)
             dmap = self._map
-            for f, e in dmap.iteritems():
-                if e[0] == 'n' and e[3] == now:
+            for f in self._updatedfiles:
+                e = dmap.get(f)
+                if e is not None and e[0] == 'n' and e[3] == now:
                     dmap[f] = dirstatetuple(e[0], e[1], e[2], -1)
                     self._nonnormalset.add(f)
 
             # emulate that all 'dirstate.normal' results are written out
             self._lastnormaltime = 0
+            self._updatedfiles.clear()
 
             # delay writing in-memory changes out
             tr.addfilegenerator('dirstate', (self._filename,),


More information about the Mercurial-devel mailing list