[PATCH] Issue #839: prevent any file paths under .hg/store/data/ from getting too long

Jesse Glick Jesse.Glick at Sun.COM
Mon Jun 30 14:32:33 CDT 2008


# HG changeset patch
# User Jesse Glick <jesse.glick at sun.com>
# Date 1214853828 14400
# Node ID 486d249d8e2188bf8309f2cad5f8b7024f06ec24
# Parent  88a1bcc5c6a7b1729eb97d0509531c0f9f4a606e
Issue #839: prevent any file paths under .hg/store/data/ from getting too long.
If a working copy file has a path which is very long and uses many underscores
or uppercase letters, the corresponding path under .hg/store/data/ can be
considerably longer still, since the store filename encoding escapes those
characters. Since Windows imposes a path length maximum of about 260 characters
(MAX_PATH), this can make the repository impossible to check out for Windows
users. Worse, you cannot correct the problem using 'hg ren' to shorten path
names, since the old repo files will never be deleted! The fix is to check for
potentially overlong repo names. If found, truncate them, inject a hash of the
full path to prevent clashes, and write the full path to .hg/store/longnames so
it can be recovered in the future. Since this is an incompatible repository
format change, it has to be represented as a new requirement.

diff --git a/mercurial/hg.py b/mercurial/hg.py
--- a/mercurial/hg.py
+++ b/mercurial/hg.py
@@ -198,6 +198,7 @@
             dest_lock = lock.lock(os.path.join(dest_store, "lock"))
 
             files = ("data",
+                     "longnames",
                      "00manifest.d", "00manifest.i",
                      "00changelog.d", "00changelog.i")
             for f in files:
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -12,10 +12,11 @@
 import lock, transaction, stat, errno, ui
 import os, revlog, time, util, extensions, hook, inspect
 import match as match_
+import sha, re
 
 class localrepository(repo.repository):
     capabilities = util.set(('lookup', 'changegroupsubset'))
-    supported = ('revlogv1', 'store')
+    supported = ('revlogv1', 'store', 'longnames')
 
     def __init__(self, parentui, path=None, create=0):
         repo.repository.__init__(self)
@@ -60,30 +61,7 @@
             if r not in self.supported:
                 raise repo.RepoError(_("requirement '%s' not supported") % r)
 
-        # setup store
-        if "store" in requirements:
-            self.encodefn = util.encodefilename
-            self.decodefn = util.decodefilename
-            self.spath = os.path.join(self.path, "store")
-        else:
-            self.encodefn = lambda x: x
-            self.decodefn = lambda x: x
-            self.spath = self.path
-
-        try:
-            # files in .hg/ will be created using this mode
-            mode = os.stat(self.spath).st_mode
-            # avoid some useless chmods
-            if (0777 & ~util._umask) == (0777 & mode):
-                mode = None
-        except OSError:
-            mode = None
-
-        self._createmode = mode
-        self.opener.createmode = mode
-        sopener = util.opener(self.spath)
-        sopener.createmode = mode
-        self.sopener = util.encodedopener(sopener, self.encodefn)
+        self._setup_store(requirements, util.opener, os.path.join)
 
         self.ui = ui.ui(parentui=parentui)
         try:
@@ -101,6 +79,118 @@
         self.filterpats = {}
         self._datafilters = {}
         self._transref = self._lockref = self._wlockref = None
+
+    def _setup_store(self, requirements, opener, pathjoiner):
+        """Set up the store path, filename codecs and store opener.
+
+        requirements -- repository requirement names; 'store' selects
+                        the encoded store layout
+        opener       -- opener class, instantiated with a base path
+        pathjoiner   -- callable joining self.path and "store"
+
+        With 'store', an encoded name under data/ longer than maxlen is
+        replaced by 'data/_=<sha1>-<sanitized tail>' and the full path is
+        appended to the store's 'longnames' file so it can be decoded.
+        """
+        if "store" in requirements:
+            self._longnames = None
+            def load_longnames():
+                # read the longnames file at most once, on first use
+                if self._longnames is None:
+                    self._longnames = {}
+                    self._longnames_transient = {}
+                    try:
+                        self._longnames_file = opener(self.spath)('longnames',
+                                                                  mode='a+')
+                        for line in self._longnames_file:
+                            datapath = line[0:-1]
+                            sha1 = sha.new(datapath).hexdigest()
+                            self._longnames[sha1] = datapath
+                    except IOError, err:
+                        if err.errno != errno.ENOENT:
+                            raise
+            maxlen = 150 - len('.hg/store/')
+            def encode(s, write=False):
+                r = util.encodefilename(s)
+                if not s.startswith('data/'):
+                    return r
+                if len(r) <= maxlen:
+                    return r
+                # an overlong name that already exists in the store keeps
+                # being used as is
+                try:
+                    opener(self.spath)(r, "rb").close()
+                    return r
+                except IOError:
+                    pass
+                datapath = s[5:]
+                sha1 = sha.new(datapath).hexdigest()
+                load_longnames()
+                if not sha1 in self._longnames:
+                    if write:
+                        self._longnames[sha1] = datapath
+                        self._longnames_file.write(datapath + '\n')
+                        self._longnames_file.flush()
+                        reqfile = self.opener("requires", "a+")
+                        try:
+                            # initial read offset of 'a+' is platform
+                            # dependent
+                            reqfile.seek(0)
+                            # splitlines() strips the line endings
+                            if 'longnames' not in reqfile.read().splitlines():
+                                reqfile.write('longnames\n')
+                        finally:
+                            reqfile.close()
+                    else:
+                        # lookup only: keep the mapping in memory so that
+                        # decode() can resolve it, without touching disk
+                        self._longnames_transient[sha1] = datapath
+                # 48 == len('data/_=') + 40 hex digits + len('-'), so the
+                # result is at most maxlen characters long
+                return 'data/_=' + sha1 + '-' + \
+                    re.sub(r"[^a-z0-9.-]", '_', datapath[48-maxlen:].lower())
+            self.encodefn = encode
+            def decode(s):
+                r = util.decodefilename(s)
+                if s.startswith('data/_='):
+                    sha1 = s[7:].split('-', 2)[0]
+                    load_longnames()
+                    if sha1 in self._longnames:
+                        r = 'data/' + self._longnames[sha1]
+                    else:
+                        r = 'data/' + self._longnames_transient[sha1]
+                return r
+            self.decodefn = decode
+            self.spath = pathjoiner(self.path, "store")
+        else:
+            def encode(s, write=False):
+                return s
+            self.encodefn = encode
+            self.decodefn = lambda x: x
+            self.spath = self.path
+
+        try:
+            # files in .hg/ will be created using this mode
+            mode = os.stat(self.spath).st_mode
+            # avoid some useless chmods
+            if (0777 & ~util._umask) == (0777 & mode):
+                mode = None
+        except OSError:
+            mode = None
+
+        self._createmode = mode
+        self.opener.createmode = mode
+
+        _spath = self.spath
+        _encodefn = self.encodefn
+        class long_name_opener(opener):
+            def __init__(self):
+                opener.__init__(self, _spath)
+                self.createmode = mode
+            def __call__(self, path, mode='r', text=False, atomictemp=False):
+                encpath = _encodefn(path, write=('w' in mode or 'a' in mode))
+                return opener.__call__(self, encpath, mode, text, atomictemp)
+        self.sopener = long_name_opener()
 
     def __getattr__(self, name):
         if name == 'changelog':
diff --git a/mercurial/statichttprepo.py b/mercurial/statichttprepo.py
--- a/mercurial/statichttprepo.py
+++ b/mercurial/statichttprepo.py
@@ -21,14 +21,16 @@
             raise IOError(num, inst)
         except urllib2.URLError, inst:
             raise IOError(None, inst.reason[1])
+    def __iter__(self):
+        return self.read().splitlines(True).__iter__()
 
-def opener(base):
+class opener:
     """return a function that opens files over http"""
-    p = base
-    def o(path, mode="r"):
-        f = "/".join((p, urllib.quote(path)))
+    def __init__(self, base):
+        self.p = base
+    def __call__(self, path, mode="r", text=False, atomictemp=False):
+        f = "/".join((self.p, urllib.quote(path)))
         return rangereader(f)
-    return o
 
 class statichttprepository(localrepo.localrepository):
     def __init__(self, ui, path):
@@ -53,16 +55,7 @@
             if r not in self.supported:
                 raise repo.RepoError(_("requirement '%s' not supported") % r)
 
-        # setup store
-        if "store" in requirements:
-            self.encodefn = util.encodefilename
-            self.decodefn = util.decodefilename
-            self.spath = self.path + "/store"
-        else:
-            self.encodefn = lambda x: x
-            self.decodefn = lambda x: x
-            self.spath = self.path
-        self.sopener = util.encodedopener(opener(self.spath), self.encodefn)
+        self._setup_store(requirements, opener, (lambda a, b: a + '/' + b))
 
         self.manifest = manifest.manifest(self.sopener)
         self.changelog = changelog.changelog(self.sopener)
diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -1373,11 +1373,6 @@
 
 encodefilename, decodefilename = _buildencodefun()
 
-def encodedopener(openerfn, fn):
-    def o(path, *args, **kw):
-        return openerfn(fn(path), *args, **kw)
-    return o
-
 def mktempcopy(name, emptyok=False, createmode=None):
     """Create a temporary file with the same contents from name
 
diff --git a/tests/test-long-filenames b/tests/test-long-filenames
new file mode 100755
--- /dev/null
+++ b/tests/test-long-filenames
@@ -0,0 +1,40 @@
+#!/bin/sh
+hg init
+echo content > 'An extremely long file name which I would not expect you would normally create as such, but which can easily arise when working with deeply nested directories.'
+hg -q add
+hg ci -m added
+find .hg | cut -c151- | egrep . || true
+hg ann An*
+hg ren An* shorter-file
+hg ci -m renamed
+find .hg | cut -c151- | egrep . || true
+hg ann shorter-file
+wc -l .hg/store/longnames
+fgrep longnames .hg/requires
+hg clone . clone
+cd clone
+hg up -r0
+ls | wc
+find .hg | cut -c151- | egrep . || true
+
+mkdir -p long-old-repo/.hg/store/data
+# to recreate: find long-old-repo -type f | ruby -ne 'f = $_.chomp; puts "printf \x27#{IO.read(f).gsub(%r{\x27|\x5C|\x22|\x24|[^ -~]}) {|c| format %q{\\x%02X}, c[0]}}\x27 > #{f}"'
+# but Bourne shell printf does not seem to work correctly
+python -c "print '\x00\x00\x00\x02 dummy changelog to prevent using the old repo layout'," > long-old-repo/.hg/00changelog.i
+python -c "print '\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\x09\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFFq\x08B\x14\x18 at J\x93|hM\x24y\xA3J\x24\xD2\xCEGW\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00ucontent\x0A'," > long-old-repo/.hg/store/data/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+python -c "print '\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x005\x00\x00\x00\xF2\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFFA\xC2\xCDaQY\x92j]\xF1N\xB7X\x0B\x033\x07;\xA3{\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00x\x9C\xAB\xA8\x18\x1E\x80\xC1\xDC\xD0\xC0\xC2\xC4\xC8\xD0\xC4\xD0\xC2\xC4\xC0\x24\xD1\xD2\xD8<\xD9\xCC\xC2\x24\xC5\xC8\xC4\xDC2\xD1\xD8\x24\xD1\x08\xC8LN5175\xE7\x02\x00\xF8ygn'," > long-old-repo/.hg/store/00manifest.i
+python -c "print '\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00B\x00\x00\x00\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x91\xDB2\x9A\xAE\x96\xCBO\x132\x940/i|k\x02\x1E\xD6\xBE\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00x\x9C31L6JN13454\xB5\xB442K4MI34IM27\xB50H2066607NJ46O\xE2*I-.\xE12P0\xE0\xAA\x18&\x80\x8B+/\xB5\x1C\x00\x1A\x8Ek\xCE'," > long-old-repo/.hg/store/00changelog.i
+python -c "print 'revlogv1\x0Astore\x0A'," > long-old-repo/.hg/requires
+cd long-old-repo
+ls .hg/store/data
+hg co
+hg ann x*
+hg clone . dupe1
+cd dupe1
+ls .hg/store/data
+hg ann x*
+cd ..
+hg clone --pull . dupe2
+cd dupe2
+ls .hg/store/data
+hg ann x*
diff --git a/tests/test-long-filenames.out b/tests/test-long-filenames.out
new file mode 100644
--- /dev/null
+++ b/tests/test-long-filenames.out
@@ -0,0 +1,24 @@
+0: content
+1: content
+1 .hg/store/longnames
+longnames
+updating working directory
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+1 files updated, 0 files merged, 1 files removed, 0 files unresolved
+      1      27     160
+xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+0: content
+updating working directory
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+0: content
+requesting all changes
+adding changesets
+adding manifests
+adding file changes
+added 1 changesets with 1 changes to 1 files
+updating working directory
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+_=66b3fcadc739900b2fcd56ef5731d19d78481850-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+0: content


More information about the Mercurial-devel mailing list