[PATCH RFC] faster dirstate walks

Chris Mason mason at suse.com
Thu Aug 25 13:29:07 CDT 2005


Hello everyone,

Here are some quick and dirty mods to the dirstate walking code.  This 
generally cuts the time for hg status/diff in half, from 2s down to 1s.  
The main parts I'm trying to optimize are:

1) os.walk stats every file.  dirstate.changes then stats every file again.

2) os.walk yields every file and subdir to dirstate.traverse who yields every 
file and everything in the dirstate map.  dirstate.walk then 
filters this mass and yields every file to the caller.  There should be
fewer steps in here.

3) dirstate.walk runs util.unique on the results from dirstate.traverse, 
even though it is also passing things through dirstate.seen to look for 
duplicates.

I've turned os.walk into something hg specific that takes all the dirstate 
ignore and matching rules into account.  The new function also takes an 
optional function (interesting()) the caller can supply to help filter out 
files it doesn't care about.  dirstate.changes uses this to update state 
for each file, avoiding the second stat call.

Testing has been extremely light, this is really just proof of concept code.  
One big functional change is the results of dirstate.walk are no longer sorted.  
If the caller really wants sorted results (most don't I think) they should 
collect into an array and sort themselves.

If the basic idea here looks good I can clean it up a little more.  I'm not
thrilled with my duplicate but slightly different checks against 
ignore and match, but the rest seems sound.

-chris

---
Index: mine/mercurial/hg.py
===================================================================
--- mine.orig/mercurial/hg.py	2005-08-25 13:44:11.000000000 -0400
+++ mine/mercurial/hg.py	2005-08-25 14:01:23.000000000 -0400
@@ -480,7 +480,7 @@ class dirstate:
                 bs += 1
         return ret
 
-    def walk(self, files = None, match = util.always, dc=None):
+    def walk(self, files = None, match=util.always, dc=None, interesting=None):
         self.read()
 
         # walk all files by default
@@ -491,66 +491,73 @@ class dirstate:
         elif not dc:
             dc = self.filterfiles(files)
 
+        # recursion free walker, faster than os.walk.
+        def __walk(s):
+            work = [s]
+            while work:
+                x = work.pop()
+                names = os.listdir(x)
+                top = x
+                d = x[len(self.root) + 1:]
+                nd = util.normpath(d) 
+                if nd == '.': nd = ''
+                for name in names:
+                    np = os.path.join(nd, name)
+                    if seen(np):
+                        continue
+                    p = os.path.join(top, name)
+                    st = os.stat(p)
+                    if stat.S_ISDIR(st.st_mode):
+                        ds = os.path.join(nd, name +'/')
+                        if not self.ignore(ds) and \
+                           (match is util.always or match(ds)):
+                            work.append(p)
+                    else:
+                        if (not interesting or interesting(np, st)) and \
+                           (np in dc or not self.ignore(np)) and \
+                           (match is util.always or match(np)):
+                            yield np
+
         known = {'.hg': 1}
         def seen(fn):
             if fn in known: return True
             known[fn] = 1
-        def traverse():
-            for ff in util.unique(files):
-                f = os.path.join(self.root, ff)
-                try:
-                    st = os.stat(f)
-                except OSError, inst:
-                    if ff not in dc: self.ui.warn('%s: %s\n' % (
-                        util.pathto(self.getcwd(), ff),
-                        inst.strerror))
+
+        for ff in util.unique(files):
+            f = os.path.join(self.root, ff)
+            try:
+                st = os.stat(f)
+            except OSError, inst:
+                if ff not in dc: self.ui.warn('%s: %s\n' % (
+                    util.pathto(self.getcwd(), ff),
+                    inst.strerror))
+                continue
+            if stat.S_ISDIR(st.st_mode):
+                for fl in __walk(f):
+                    yield 'f', util.pconvert(fl)
+            elif stat.S_ISREG(st.st_mode):
+                ff = util.normpath(ff)
+                if seen(ff) or (ff not in dc and self.ignore(ff)):
                     continue
-                if stat.S_ISDIR(st.st_mode):
-                    for dir, subdirs, fl in os.walk(f):
-                        d = dir[len(self.root) + 1:]
-                        nd = util.normpath(d)
-                        if nd == '.': nd = ''
-                        if seen(nd):
-                            subdirs[:] = []
-                            continue
-                        for sd in subdirs:
-                            ds = os.path.join(nd, sd +'/')
-                            if self.ignore(ds) or not match(ds):
-                                subdirs.remove(sd)
-                        subdirs.sort()
-                        fl.sort()
-                        for fn in fl:
-                            fn = util.pconvert(os.path.join(d, fn))
-                            yield 'f', fn
-                elif stat.S_ISREG(st.st_mode):
+                if match is util.always or match(ff):
                     yield 'f', ff
-                else:
-                    kind = 'unknown'
-                    if stat.S_ISCHR(st.st_mode): kind = 'character device'
-                    elif stat.S_ISBLK(st.st_mode): kind = 'block device'
-                    elif stat.S_ISFIFO(st.st_mode): kind = 'fifo'
-                    elif stat.S_ISLNK(st.st_mode): kind = 'symbolic link'
-                    elif stat.S_ISSOCK(st.st_mode): kind = 'socket'
-                    self.ui.warn('%s: unsupported file type (type is %s)\n' % (
-                        util.pathto(self.getcwd(), ff),
-                        kind))
-
-            ks = dc.keys()
-            ks.sort()
-            for k in ks:
+            else:
+                kind = 'unknown'
+                if stat.S_ISCHR(st.st_mode): kind = 'character device'
+                elif stat.S_ISBLK(st.st_mode): kind = 'block device'
+                elif stat.S_ISFIFO(st.st_mode): kind = 'fifo'
+                elif stat.S_ISLNK(st.st_mode): kind = 'symbolic link'
+                elif stat.S_ISSOCK(st.st_mode): kind = 'socket'
+                self.ui.warn('%s: unsupported file type (type is %s)\n' % (
+                    util.pathto(self.getcwd(), ff),
+                    kind))
+
+        ks = dc.keys()
+        ks.sort()
+        for k in ks:
+            if not seen(k) and (match is util.always or match(k)):
                 yield 'm', k
 
-        # yield only files that match: all in dirstate, others only if
-        # not in .hgignore
-
-        for src, fn in util.unique(traverse()):
-            fn = util.normpath(fn)
-            if seen(fn): continue
-            if fn not in dc and self.ignore(fn):
-                continue
-            if match(fn):
-                yield src, fn
-
     def changes(self, files=None, match=util.always):
         self.read()
         if not files:
@@ -560,28 +567,33 @@ class dirstate:
         lookup, modified, added, unknown = [], [], [], []
         removed, deleted = [], []
 
-        for src, fn in self.walk(files, match, dc=dc):
-            try:
-                s = os.stat(os.path.join(self.root, fn))
-            except OSError:
-                continue
+        def interesting(fn, s):
             if not stat.S_ISREG(s.st_mode):
-                continue
+                return False
             c = dc.get(fn)
             if c:
                 del dc[fn]
-                if c[0] == 'm':
+                type, mode, size, time = c
+                if type == 'm':
                     modified.append(fn)
-                elif c[0] == 'a':
+                elif type == 'a':
                     added.append(fn)
-                elif c[0] == 'r':
+                elif type == 'r':
                     unknown.append(fn)
-                elif c[2] != s.st_size or (c[1] ^ s.st_mode) & 0100:
+                elif size != s.st_size or (mode ^ s.st_mode) & 0100:
                     modified.append(fn)
-                elif c[3] != s.st_mtime:
+                elif time != s.st_mtime:
                     lookup.append(fn)
             else:
                 unknown.append(fn)
+            return False
+
+        for src, fn in self.walk(files, match, dc=dc, interesting=interesting):
+            try:
+                s = os.stat(os.path.join(self.root, fn))
+            except OSError:
+                continue
+            interesting(fn, s)
 
         for fn, c in [(fn, c) for fn, c in dc.items() if match(fn)]:
             if c[0] == 'r':


More information about the Mercurial mailing list