[PATCH 6 of 6 v2] scmutil: speed up case collision checking

Joshua Redstone joshua.redstone at fb.com
Tue Jun 26 14:04:43 CDT 2012


# HG changeset patch
# User Joshua Redstone <joshua.redstone at fb.com>
# Date 1340048916 25200
# Node ID d963a5794553462ee6605eb9efc89a0986c97a71
# Parent  592a6e3e9f4df3e2e3b8a137cb3aaa5143a96451
scmutil: speed up case collision checking

scmutil.casecollisionauditor is slow for large repos.  Switch to a faster
implementation based on reducing the fraction of all files that are scanned
and caching the results of case conversion lazily rather than precomputing.

diff --git a/mercurial/cmdutil.py b/mercurial/cmdutil.py
--- a/mercurial/cmdutil.py
+++ b/mercurial/cmdutil.py
@@ -1202,12 +1202,13 @@
     cca = None
     abort, warn = scmutil.checkportabilityalert(ui)
     if abort or warn:
-        cca = scmutil.casecollisionauditor(ui, abort, wctx)
+        cca = scmutil.casecollisionauditor(ui, abort,
+                                           repo.dirstate.sortedfiles())
     for f in repo.walk(match):
         exact = match.exact(f)
         if exact or not explicitonly and f not in repo.dirstate:
             if cca:
-                cca(f)
+                cca.checkcollision(f)
             names.append(f)
             if ui.verbose or not exact:
                 ui.status(_('adding %s\n') % match.rel(join(f)))
diff --git a/mercurial/scmutil.py b/mercurial/scmutil.py
--- a/mercurial/scmutil.py
+++ b/mercurial/scmutil.py
@@ -9,6 +9,7 @@
 import util, error, osutil, revset, similar, encoding
 import match as matchmod
 import os, errno, re, stat, sys, glob
+import bisect
 
 def nochangesfound(ui, secretlist=None):
     '''report no changes for push/pull'''
@@ -49,22 +50,56 @@
     return abort, warn
 
 class casecollisionauditor(object):
-    def __init__(self, ui, abort, existingiter):
+    def __init__(self, ui, abort, sortedfiles):
         self._ui = ui
         self._abort = abort
-        self._map = {}
-        for f in existingiter:
-            self._map[encoding.lower(f)] = f
+        self._loweredfiles = set()
+        self._lowereddirs = set()
+        self._sortedfiles = sortedfiles
 
-    def __call__(self, f):
-        fl = encoding.lower(f)
-        map = self._map
-        if fl in map and map[fl] != f:
-            msg = _('possible case-folding collision for %s') % f
+    def checkcollision(self, path):
+        """
+        Check if path has a case collision with another file, either
+        one we've checked before or one in a sorted list of files.
+        """
+        dirn = os.path.dirname(path)
+        hit = False
+        loweredpath = encoding.lower(path)
+        if dirn in self._lowereddirs:
+            # All files in dirn (but not subdirs) have been lowercased so we
+            # need only check for the lowered filename in the cache of lowered
+            # files.
+            hit = loweredpath in self._loweredfiles
+        else:
+            dirlevel = path.count('/')
+            # If we're checking a/b/c, and we know there's already a file in the
+            # repo in the dir a/b, then we know that the 'a/b' component can not
+            # have a case collision, so we can limit our search to files in the
+            # tree under a/b.  The prefixmatch call is to find the longest path
+            # component we can use to limit the search.
+            prefix = util.prefixmatch(path, self._sortedfiles)
+            prefixdirslash = os.path.dirname(prefix) + "/"
+            if prefixdirslash == "/":
+                prefixdirslash = ""
+            startidx = bisect.bisect_right(self._sortedfiles, prefixdirslash)
+            hit = False
+            for idx in xrange(startidx, len(self._sortedfiles)):
+                f = self._sortedfiles[idx]
+                if not f.startswith(prefixdirslash):
+                    break
+                if f.count('/') != dirlevel:
+                    continue
+                lowerf = encoding.lower(f)
+                self._loweredfiles.add(lowerf)
+                if lowerf == loweredpath and f != path:
+                    hit = True
+            self._lowereddirs.add(dirn)
+        self._loweredfiles.add(loweredpath)
+        if hit:
+            msg = _('possible case-folding collision for %s') % path
             if self._abort:
                 raise util.Abort(msg)
             self._ui.warn(_("warning: %s\n") % msg)
-        map[fl] = f
 
 class pathauditor(object):
     '''ensure that a filesystem path contains no banned components.
diff --git a/tests/test-casecollision.t b/tests/test-casecollision.t
--- a/tests/test-casecollision.t
+++ b/tests/test-casecollision.t
@@ -31,6 +31,29 @@
   $ hg st
   A A
   A a
+  $ mkdir b
+  $ touch b/c b/D
+  $ hg add b
+  adding b/D
+  adding b/c
+  $ touch b/d b/C
+  $ hg add b/C
+  warning: possible case-folding collision for b/C
+  $ hg add b/d
+  warning: possible case-folding collision for b/d
+  $ touch b/a1 b/a2
+  $ hg add b
+  adding b/a1
+  adding b/a2
+  $ touch b/A2 b/a1.1
+  $ hg add b/a1.1 b/A2
+  warning: possible case-folding collision for b/A2
+  $ touch b/f b/F
+  $ hg add b/f b/F
+  warning: possible case-folding collision for b/f
+  $ touch g G
+  $ hg add g G
+  warning: possible case-folding collision for g
 
 case changing rename must not warn or abort
 


More information about the Mercurial-devel mailing list