Solving long paths by hashing

Jesse Glick jesse.glick at sun.com
Mon Jun 30 10:52:20 CDT 2008


Adrian Buehlmann wrote:
> I'll post Jesse's patch below so you can read and comment it inline:

I actually had an updated version of the patch. (I think I attached an 
initial version to the bug report but was asked to post to the list, and 
wound up making some improvements.) It's been too long for me to
remember the exact differences, but the updated version seems to include
some refactoring and more extensive tests.



Issue #839: prevent any file paths under .hg/store/data/ from getting
too long.

If a working copy file has a path which is very long and uses many
underscores or uppercase letters, the corresponding path inside the repo
store can be especially long, because the store filename encoding expands
each of those characters. Since Windows imposes a path length maximum of
around 256 characters, this can make the repository impossible to check
out for Windows users. Worse, you cannot correct the problem by using
'hg ren' to shorten path names, since the old repo files will never be
deleted!

The fix is to check for potentially overlong repo names. If one is found,
truncate it, inject a hash of the full path to prevent clashes, and write
the full path to .hg/store/longnames so it can be recovered in the future.
Since this is an incompatible repository format change, it has to be
represented as a new requirement.

diff -r aea4c666b5c3 mercurial/hg.py
--- a/mercurial/hg.py	Wed Dec 19 14:26:19 2007 -0500
+++ b/mercurial/hg.py	Wed Dec 19 15:35:54 2007 -0500
@@ -198,6 +198,7 @@ def clone(ui, source, dest=None, pull=Fa
              dest_lock = lock.lock(os.path.join(dest_store, "lock"))

              files = ("data",
+                     "longnames",
                       "00manifest.d", "00manifest.i",
                       "00changelog.d", "00changelog.i")
              for f in files:
diff -r aea4c666b5c3 mercurial/localrepo.py
--- a/mercurial/localrepo.py	Wed Dec 19 14:26:19 2007 -0500
+++ b/mercurial/localrepo.py	Wed Dec 19 15:35:54 2007 -0500
@@ -11,10 +11,11 @@ import changelog, dirstate, filelog, man
  import changelog, dirstate, filelog, manifest, context, weakref
  import re, lock, transaction, tempfile, stat, errno, ui
  import os, revlog, time, util, extensions, hook
+import sha

  class localrepository(repo.repository):
      capabilities = util.set(('lookup', 'changegroupsubset'))
-    supported = ('revlogv1', 'store')
+    supported = ('revlogv1', 'store', 'longnames')

      def __init__(self, parentui, path=None, create=0):
          repo.repository.__init__(self)
@@ -59,17 +60,7 @@ class localrepository(repo.repository):
              if r not in self.supported:
                  raise repo.RepoError(_("requirement '%s' not 
supported") % r)

-        # setup store
-        if "store" in requirements:
-            self.encodefn = util.encodefilename
-            self.decodefn = util.decodefilename
-            self.spath = os.path.join(self.path, "store")
-        else:
-            self.encodefn = lambda x: x
-            self.decodefn = lambda x: x
-            self.spath = self.path
-        self.sopener = util.encodedopener(util.opener(self.spath),
-                                          self.encodefn)
+        self._setup_store(requirements, util.opener, os.path.join)

          self.ui = ui.ui(parentui=parentui)
          try:
@@ -84,6 +75,80 @@ class localrepository(repo.repository):
          self.nodetagscache = None
          self.filterpats = {}
          self._transref = self._lockref = self._wlockref = None
+
+    def _setup_store(self, requirements, opener, pathjoiner):
+        if "store" in requirements:
+            self._longnames = None
+            def load_longnames():
+                if self._longnames == None:
+                    self._longnames = {}
+                    self._longnames_transient = {}
+                    try:
+                        self._longnames_file = 
opener(self.spath)('longnames',
+ 
mode='a+')
+                        for line in self._longnames_file:
+                            datapath = line[0:-1]
+                            sha1 = sha.new(datapath).hexdigest()
+                            self._longnames[sha1] = datapath
+                    except IOError, err:
+                        if err.errno != errno.ENOENT:
+                            raise
+            maxlen = 150 - len('.hg/store/')
+            def encode(s, write=False):
+                r = util.encodefilename(s)
+                if not s.startswith('data/'):
+                    return r
+                if len(r) <= maxlen:
+                    return r
+                try:
+                    opener(self.spath)(r, "rb").close()
+                    return r
+                except IOError:
+                    pass
+                datapath = s[5:]
+                sha1 = sha.new(datapath).hexdigest()
+                load_longnames()
+                if not sha1 in self._longnames:
+                    if write:
+                        self._longnames[sha1] = datapath
+                        self._longnames_file.write(datapath + '\n')
+                        self._longnames_file.flush()
+                        reqfile = self.opener("requires", "a+")
+                        if 'longnames\n' not in 
reqfile.read().splitlines():
+                            reqfile.write('longnames\n')
+                            reqfile.close()
+                        else:
+                            self._longnames_transient[sha1] = datapath
+                return 'data/_=' + sha1 + '-' + \
+                    re.sub(r"[^a-z0-9.-]", '_', 
datapath[48-maxlen:].lower())
+            self.encodefn = encode
+            def decode(s):
+                r = util.decodefilename(s)
+                if s.startswith('data/_='):
+                    sha1 = s[7:].split('-', 2)[0]
+                    load_longnames()
+                    if sha1 in self._longnames:
+                        r = 'data/' + self._longnames[sha1]
+                    else:
+                        r = 'data/' + self._longnames_transient[sha1]
+                return r
+            self.decodefn = decode
+            self.spath = pathjoiner(self.path, "store")
+        else:
+            def encode(s, write=False):
+                return s
+            self.encodefn = encode
+            self.decodefn = lambda x: x
+            self.spath = self.path
+        _spath = self.spath
+        _encodefn = self.encodefn
+        class long_name_opener(opener):
+            def __init__(self):
+                opener.__init__(self, _spath)
+            def __call__(self, path, mode='r', text=False, 
atomictemp=False):
+                encpath = _encodefn(path, write=('w' in mode or 'a' in 
mode))
+                return opener.__call__(self, encpath, mode, text, 
atomictemp)
+        self.sopener = long_name_opener()

      def __getattr__(self, name):
          if name == 'changelog':
diff -r aea4c666b5c3 mercurial/statichttprepo.py
--- a/mercurial/statichttprepo.py	Wed Dec 19 14:26:19 2007 -0500
+++ b/mercurial/statichttprepo.py	Wed Dec 19 15:35:54 2007 -0500
@@ -19,14 +19,16 @@ class rangereader(httprangereader.httpra
              raise IOError(None, inst)
          except urllib2.URLError, inst:
              raise IOError(None, inst.reason[1])
+    def __iter__(self):
+        return self.read().splitlines(True).__iter__()

-def opener(base):
+class opener:
      """return a function that opens files over http"""
-    p = base
-    def o(path, mode="r"):
-        f = "/".join((p, urllib.quote(path)))
+    def __init__(self, base):
+        self.p = base
+    def __call__(self, path, mode="r", text=False, atomictemp=False):
+        f = "/".join((self.p, urllib.quote(path)))
          return rangereader(f)
-    return o

  class statichttprepository(localrepo.localrepository):
      def __init__(self, ui, path):
@@ -45,16 +47,7 @@ class statichttprepository(localrepo.loc
              if r not in self.supported:
                  raise repo.RepoError(_("requirement '%s' not 
supported") % r)

-        # setup store
-        if "store" in requirements:
-            self.encodefn = util.encodefilename
-            self.decodefn = util.decodefilename
-            self.spath = self.path + "/store"
-        else:
-            self.encodefn = lambda x: x
-            self.decodefn = lambda x: x
-            self.spath = self.path
-        self.sopener = util.encodedopener(opener(self.spath), 
self.encodefn)
+        self._setup_store(requirements, opener, (lambda a, b: a + '/' + b))

          self.manifest = manifest.manifest(self.sopener)
          self.changelog = changelog.changelog(self.sopener)
diff -r aea4c666b5c3 mercurial/util.py
--- a/mercurial/util.py	Wed Dec 19 14:26:19 2007 -0500
+++ b/mercurial/util.py	Wed Dec 19 15:35:54 2007 -0500
@@ -1292,11 +1292,6 @@ def _buildencodefun():

  encodefilename, decodefilename = _buildencodefun()

-def encodedopener(openerfn, fn):
-    def o(path, *args, **kw):
-        return openerfn(fn(path), *args, **kw)
-    return o
-
  def mktempcopy(name, emptyok=False):
      """Create a temporary file with the same contents from name

diff -r aea4c666b5c3 tests/test-long-filenames
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-long-filenames	Wed Dec 19 15:35:54 2007 -0500
@@ -0,0 +1,40 @@
+#!/bin/sh
+hg init
+echo content > 'An extremely long file name which I would not expect 
you would normally create as such, but which can easily arise when 
working with deeply nested directories.'
+hg -q add
+hg ci -m added
+find .hg | cut -c151- | egrep . || true
+hg ann An*
+hg ren An* shorter-file
+hg ci -m renamed
+find .hg | cut -c151- | egrep . || true
+hg ann shorter-file
+wc -l .hg/store/longnames
+fgrep longnames .hg/requires
+hg clone . clone
+cd clone
+hg up -r0
+ls | wc
+find .hg | cut -c151- | egrep . || true
+
+mkdir -p long-old-repo/.hg/store/data
+# to recreate: find long-old-repo -type f | ruby -ne 'f = $_.chomp; 
puts "printf \x27#{IO.read(f).gsub(%r{\x27|\x5C|\x22|\x24|[^ -~]}) {|c| 
format %q{\\x%02X}, c[0]}}\x27 > #{f}"'
+# but Bourne shell printf does not seem to work correctly
+python -c "print '\x00\x00\x00\x02 dummy changelog to prevent using the 
old repo layout'," > long-old-repo/.hg/00changelog.i
+python -c "print 
'\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\x09\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFFq\x08B\x14\x18 at J\x93|hM\x24y\xA3J\x24\xD2\xCEGW\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00ucontent\x0A'," 
 > 
long-old-repo/.hg/store/data/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+python -c "print 
'\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x005\x00\x00\x00\xF2\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFFA\xC2\xCDaQY\x92j]\xF1N\xB7X\x0B\x033\x07;\xA3{\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00x\x9C\xAB\xA8\x18\x1E\x80\xC1\xDC\xD0\xC0\xC2\xC4\xC8\xD0\xC4\xD0\xC2\xC4\xC0\x24\xD1\xD2\xD8<\xD9\xCC\xC2\x24\xC5\xC8\xC4\xDC2\xD1\xD8\x24\xD1\x08\xC8LN5175\xE7\x02\x00\xF8ygn'," 
 > long-old-repo/.hg/store/00manifest.i
+python -c "print 
'\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00B\x00\x00\x00\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x91\xDB2\x9A\xAE\x96\xCBO\x132\x940/i|k\x02\x1E\xD6\xBE\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00x\x9C31L6JN13454\xB5\xB442K4MI34IM27\xB50H2066607NJ46O\xE2*I-.\xE12P0\xE0\xAA\x18&\x80\x8B+/\xB5\x1C\x00\x1A\x8Ek\xCE'," 
 > long-old-repo/.hg/store/00changelog.i
+python -c "print 'revlogv1\x0Astore\x0A'," > long-old-repo/.hg/requires
+cd long-old-repo
+ls .hg/store/data
+hg co
+hg ann x*
+hg clone . dupe1
+cd dupe1
+ls .hg/store/data
+hg ann x*
+cd ..
+hg clone --pull . dupe2
+cd dupe2
+ls .hg/store/data
+hg ann x*
diff -r aea4c666b5c3 tests/test-long-filenames.out
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-long-filenames.out	Wed Dec 19 15:35:54 2007 -0500
@@ -0,0 +1,21 @@
+0: content
+1: content
+1 .hg/store/longnames
+longnames
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+1 files updated, 0 files merged, 1 files removed, 0 files unresolved
+      1      27     160
+xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+0: content
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+0: content
+requesting all changes
+adding changesets
+adding manifests
+adding file changes
+added 1 changesets with 1 changes to 1 files
+1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+_=66b3fcadc739900b2fcd56ef5731d19d78481850-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.i
+0: content



More information about the Mercurial-devel mailing list