[PATCH 1 of 1] Issue919: add a standard extension to recreate hardlinks between repositories

Jesse Glick typrase at gmail.com
Thu Nov 5 16:43:58 CST 2009


# HG changeset patch
# User Jesse Glick <jesse.glick at sun.com>
# Date 1257460683 18000
# Node ID 8dbb7387f6fc25098572ff38c8a701e388de5d6b
# Parent  8773d46861f5cd26bf629228c0e3ae4378b001c7
Issue919: add a standard extension to recreate hardlinks between repositories.
Having to run a standalone Python script from the contrib dir is a nuisance.
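The extension also improves on the script, including: both repositories are
locked while relinking, .d data files are discovered directly (and only
regular files are considered), and all output goes through the ui object.
The wiki page should be updated as well:
http://www.selenic.com/mercurial/wiki/index.cgi/RecreateHardlinksBetweenRepositories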

diff --git a/contrib/hg-relink b/hgext/relink.py
rename from contrib/hg-relink
rename to hgext/relink.py
--- a/contrib/hg-relink
+++ b/hgext/relink.py
@@ -1,52 +1,74 @@
-#!/usr/bin/env python
+# Mercurial extension to provide 'hg relink' command
 #
 # Copyright (C) 2007 Brendan Cully <brendan at kublai.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2, incorporated herein by reference.
 
-import os, sys
+"""recreates hardlinks between repository clones"""
 
-class ConfigError(Exception): pass
+from mercurial import cmdutil, hg, util
+from mercurial.i18n import _
+import os, stat
 
-def usage():
-    print """relink <source> <destination>
-    Recreate hard links between source and destination repositories"""
+def relink(ui, repo, origin=None, **opts):
+    """recreate hardlinks between two repositories
 
-class Config:
-    def __init__(self, args):
-        if len(args) != 3:
-            raise ConfigError("wrong number of arguments")
-        self.src = os.path.abspath(args[1])
-        self.dst = os.path.abspath(args[2])
-        for d in (self.src, self.dst):
-            if not os.path.exists(os.path.join(d, '.hg')):
-                raise ConfigError("%s: not a mercurial repository" % d)
+    When repositories are cloned locally, their data files will be hardlinked
+    so that they only use the space of a single repository.
 
-def collect(src):
+    Unfortunately, subsequent pulls into either repository will break hardlinks
+    for any files touched by the new changesets, even if both repositories end
+    up pulling the same changes.
+
+    Similarly, passing --rev to "hg clone" will fail to use
+    any hardlinks, falling back to a complete copy of the source repository.
+
+    This command lets you recreate those hardlinks and reclaim that wasted
+    space.
+
+    This repository will be relinked to share space with ORIGIN, which must be
+    on the same local disk. If ORIGIN is omitted, looks for "default-relink",
+    then "default", in [paths].
+
+    Do not attempt any read operations on this repository while the command is
+    running. (Both repositories will be locked against writes.)
+    """
+    src = hg.repository(
+        cmdutil.remoteui(repo, opts),
+        ui.expandpath(origin or 'default-relink', origin or 'default'))
+    if not src.local():
+        raise util.Abort('must specify local origin repository')
+    ui.status(_('relinking %s to %s\n') % (src.store.path, repo.store.path))
+    locallock = repo.lock()
+    try:
+        remotelock = src.lock()
+        try:
+            candidates = collect(src.store.path, ui)
+            targets = prune(candidates, repo.store.path, ui)
+            do_relink(src.store.path, repo.store.path, targets, ui)
+        finally:
+            remotelock.release()
+    finally:
+        locallock.release()
+
+def collect(src, ui):
     seplen = len(os.path.sep)
     candidates = []
     for dirpath, dirnames, filenames in os.walk(src):
         relpath = dirpath[len(src) + seplen:]
         for filename in filenames:
-            if not filename.endswith('.i'):
+            if not filename[-2:] in ('.d', '.i'):
                 continue
             st = os.stat(os.path.join(dirpath, filename))
+            if not stat.S_ISREG(st.st_mode):
+                continue
             candidates.append((os.path.join(relpath, filename), st))
 
+    ui.status(_('collected %d candidate storage files\n') % len(candidates))
     return candidates
 
-def prune(candidates, dst):
-    def getdatafile(path):
-        if not path.endswith('.i'):
-            return None, None
-        df = path[:-1] + 'd'
-        try:
-            st = os.stat(df)
-        except OSError:
-            return None, None
-        return df, st
-
+def prune(candidates, dst, ui):
     def linkfilter(dst, st):
         try:
             ts = os.stat(dst)
@@ -57,9 +79,9 @@
             return False
         if st.st_dev != ts.st_dev:
             # No point in continuing
-            raise Exception('Source and destination are on different devices')
+            raise util.Abort(
+                _('source and destination are on different devices'))
         if st.st_size != ts.st_size:
-            # TODO: compare revlog heads
             return False
         return st
 
@@ -68,15 +90,14 @@
         tgt = os.path.join(dst, fn)
         ts = linkfilter(tgt, st)
         if not ts:
+            ui.debug(_('not linkable: %s\n') % fn)
             continue
         targets.append((fn, ts.st_size))
-        df, ts = getdatafile(tgt)
-        if df:
-            targets.append((fn[:-1] + 'd', ts.st_size))
 
+    ui.status(_('pruned down to %d probably relinkable files\n') % len(targets))
     return targets
 
-def relink(src, dst, files):
+def do_relink(src, dst, files, ui):
     def relinkfile(src, dst):
         bak = dst + '.bak'
         os.rename(dst, bak)
@@ -91,7 +112,10 @@
     relinked = 0
     savedbytes = 0
 
+    pos = 0
+    total = len(files)
     for f, sz in files:
+        pos += 1
         source = os.path.join(src, f)
         tgt = os.path.join(dst, f)
         sfp = file(source)
@@ -103,26 +127,23 @@
                 break
             sin = sfp.read(CHUNKLEN)
         if sin:
+            ui.debug(_('not linkable: %s\n') % f)
             continue
         try:
             relinkfile(source, tgt)
-            print 'Relinked %s' % f
+            ui.progress(_('relink'), pos, f, _(' files'), total)
             relinked += 1
             savedbytes += sz
         except OSError, inst:
-            print '%s: %s' % (tgt, str(inst))
+            ui.warn(_('%s: %s\n') % (tgt, str(inst)))
 
-    print 'Relinked %d files (%d bytes reclaimed)' % (relinked, savedbytes)
+    ui.status(_('relinked %d files (%d bytes reclaimed)\n') %
+              (relinked, savedbytes))
 
-try:
-    cfg = Config(sys.argv)
-except ConfigError, inst:
-    print str(inst)
-    usage()
-    sys.exit(1)
-
-src = os.path.join(cfg.src, '.hg')
-dst = os.path.join(cfg.dst, '.hg')
-candidates = collect(src)
-targets = prune(candidates, dst)
-relink(src, dst, targets)
+cmdtable = {
+    'relink': (
+        relink,
+        [],
+        _('[ORIGIN]')
+    )
+}
diff --git a/mercurial/commands.py b/mercurial/commands.py
--- a/mercurial/commands.py
+++ b/mercurial/commands.py
@@ -618,7 +618,7 @@
     in the destination.
 
     Using -r/--rev (or 'clone src#rev dest') implies --pull, even for
-    local source repositories.
+    local source repositories. Use relink extension to reclaim hardlinks.
 
     For efficiency, hardlinks are used for cloning whenever the source
     and destination are on the same filesystem (note this applies only


More information about the Mercurial-devel mailing list