[PATCH] Add script to rewrite manifest to workaround lack of parent deltas

Greg Ward greg-hg at gerg.ca
Mon Aug 24 16:16:18 CDT 2009


# HG changeset patch
# User Greg Ward <greg-hg at gerg.ca>
# Date 1233047576 0
# Node ID 9235025c8eb4219a045b8c9d3e95ce5079b2488d
# Parent  20ed9909dbd9251f76cacb463c480488eebf736b
Add script to rewrite manifest to workaround lack of parent deltas.

Based on a patch to rewrite-log by Benoit Boissinot that I found here:
  http://article.gmane.org/gmane.comp.version-control.mercurial.general/11908

diff --git a/contrib/shrink-manifest.py b/contrib/shrink-manifest.py
new file mode 100755
--- /dev/null
+++ b/contrib/shrink-manifest.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+
+"""\
+Reorder the manifest file in the current repository to save space.
+Specifically, this topologically sorts the revisions in the manifest so
+that revisions on the same branch are adjacent as much as possible.  This
+is a workaround for the fact that Mercurial computes deltas relative to the
+previous revision rather than relative to a parent revision.
+"""
+
+# Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
+# as a patch to rewrite-log.  Cleaned up, refactored, documented, and
+# renamed by Greg Ward <greg at gerg.ca>.
+
+# XXX would be nice to have a way to verify the repository after shrinking,
+# e.g. by comparing "before" and "after" states of random changesets
+# (maybe: export before, shrink, export after, diff).
+
+import sys, os, tempfile
+from mercurial import ui as ui_, hg, revlog, transaction, node, util
+
+def toposort(rl):
+    write = sys.stdout.write
+
+    children = {}
+    root = []
+    # build children and roots
+    write('reading %d revs ' % len(rl))
+    try:
+        for i in rl:
+            children[i] = []
+            parents = [p for p in rl.parentrevs(i) if p != node.nullrev]
+            for p in parents:
+                assert p in children
+                children[p].append(i)
+
+            if len(parents) == 0:
+                root.append(i)
+
+            if i % 1000 == 0:
+                write('.')
+    finally:
+        write('\n')
+
+    # XXX this is a reimplementation of the 'branchsort' topo sort
+    # algorithm in hgext.convert.convcmd... would be nice not to duplicate
+    # the algorithm
+    write('sorting ...')
+    visit = root
+    ret = []
+    while visit:
+        i = visit.pop(0)
+        ret.append(i)
+        if i not in children:
+            # This only happens if some node's p1 == p2, which can
+            # happen in the manifest in certain circumstances.
+            continue
+        next = []
+        for c in children.pop(i):
+            parents_unseen = [p for p in rl.parentrevs(c)
+                              if p != node.nullrev and p in children]
+            if len(parents_unseen) == 0:
+                next.append(c)
+        visit = next + visit
+    write('\n')
+    return ret
+
+def writerevs(r1, r2, order, tr):
+    write = sys.stdout.write
+    write('writing %d revs ' % len(order))
+    try:
+        count = 0
+        for rev in order:
+            n = r1.node(rev)
+            p1, p2 = r1.parents(n)
+            l = r1.linkrev(rev)
+            t = r1.revision(n)
+            n2 = r2.addrevision(t, tr, l, p1, p2)
+
+            if count % 1000 == 0:
+                write('.')
+            count += 1
+    finally:
+        write('\n')
+    
+def report(olddatafn, newdatafn):
+    oldsize = float(os.stat(olddatafn).st_size)
+    newsize = float(os.stat(newdatafn).st_size)
+
+    # argh: have to pass an int to %d, because a float >= 2^32 
+    # blows up under Python 2.5 or earlier
+    sys.stdout.write('old file size: %12d bytes (%6.1f MiB)\n'
+                     % (int(oldsize), oldsize/1024/1024))
+    sys.stdout.write('new file size: %12d bytes (%6.1f MiB)\n'
+                     % (int(newsize), newsize/1024/1024))
+
+    shrink_percent = (oldsize - newsize) / oldsize * 100
+    shrink_factor = oldsize / newsize
+    sys.stdout.write('shrinkage: %.1f%% (%.1fx)\n'
+                     % (shrink_percent, shrink_factor))
+
+def main():
+
+    # Unbuffer stdout for nice progress output.
+    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+    write = sys.stdout.write
+
+    # Open the local repository.
+    ui = ui_.ui()
+    repo = hg.repository(ui)
+
+    indexfn = repo.sjoin('00manifest.i')
+    datafn = indexfn[:-2] + '.d'
+    if not os.path.exists(datafn):
+        sys.exit('error: %s does not exist: manifest not big enough '
+                 'to be worth shrinking' % datafn)
+
+    oldindexfn = indexfn + '.old'
+    olddatafn = datafn + '.old'
+    if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
+        sys.exit('error: one or both of\n'
+                 '  %s\n'
+                 '  %s\n'
+                 'exists from a previous run; please clean up before '
+                 'running again'
+                 % (oldindexfn, olddatafn))
+
+    (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=repo.sjoin(''),
+                                           prefix='00manifest.',
+                                           suffix='.i')
+    tmpdatafn = tmpindexfn[:-2] + '.d'
+    os.close(tmpfd)
+
+    r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
+    r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
+
+    # Don't use repo.transaction(), because then things get hairy with
+    # paths: some need to be relative to .hg, and some need to be
+    # absolute.  Doing it this way keeps things simple: everything is an
+    # absolute path.
+    lock = repo.lock(wait=False)
+    tr = transaction.transaction(sys.stderr.write,
+                                 open,
+                                 repo.sjoin('journal'))
+
+    try:
+        try:
+            order = toposort(r1)
+            writerevs(r1, r2, order, tr)
+            report(datafn, tmpdatafn)
+            tr.close()
+        except:
+            # Abort transaction first, so we truncate the files before
+            # deleting them.
+            tr.abort()
+            if os.path.exists(tmpindexfn):
+                os.unlink(tmpindexfn)
+            if os.path.exists(tmpdatafn):
+                os.unlink(tmpdatafn)
+            raise
+    finally:
+        lock.release()
+
+    os.link(indexfn, oldindexfn)
+    os.link(datafn, olddatafn)
+    os.rename(tmpindexfn, indexfn)
+    os.rename(tmpdatafn, datafn)
+    write('note: old manifest saved in:\n'
+          '  %s\n'
+          '  %s\n'
+          '(You can delete those files when you are satisfied that your\n'
+          'repository is still sane.)\n'
+          % (oldindexfn, olddatafn))
+
+try:
+    main()
+except KeyboardInterrupt:
+    sys.exit("interrupted")
+


More information about the Mercurial-devel mailing list