[PATCH 3 of 3] Add splitrepo command to the convert extension

Alexis S. L. Carvalho alexis at cecm.usp.br
Sat Sep 1 01:56:38 CDT 2007


# HG changeset patch
# User Alexis S. L. Carvalho <alexis at cecm.usp.br>
# Date 1188629314 10800
# Node ID a9d7dd149cb2aaf23f800853ea737c9594a8602e
# Parent  14b43c5dfd0035b7d8e6e35dfe72cb5157a3482b
Add splitrepo command to the convert extension.

The --filemap support in hg convert doesn't handle merges correctly.
(And after 33015dac5df5 I managed to break it even for simple cases
where we don't want the first revision.)

I don't think it's possible to correctly handle merges without some help
from the converter_source.  For starters, to decide if we want to
include a merge, we want to look at the files that it has changed
compared to all of its parents, but current implementations of
getchanges return the files that have changed compared to either a
single parent or at least one of the parents.

So I've added a separate converter_source that reads hg repositories
and does the work necessary to calculate the subgraph we're interested
in, and added a splitrepo command that does a conversion using this
special converter_source.

The implementation depends on the ability to skip some revisions and to
change the parents field of the commit objects that we returned earlier.
We could avoid this rewrite by calculating the subgraph of interest
directly in getheads/getcommit, but that would be very slow, since we'd
have to do pretty much all the work that getchanges does while walking
back in the graph.

diff -r 14b43c5dfd00 -r a9d7dd149cb2 hgext/convert/__init__.py
--- a/hgext/convert/__init__.py	Sat Sep 01 03:48:34 2007 -0300
+++ b/hgext/convert/__init__.py	Sat Sep 01 03:48:34 2007 -0300
@@ -10,12 +10,13 @@ from git import convert_git
 from git import convert_git
 from hg import mercurial_source, mercurial_sink
 from subversion import convert_svn, debugsvnlog
+import splitrepo
 
 import os, shlex, shutil
 from mercurial import hg, ui, util, commands
 from mercurial.i18n import _
 
-commands.norepo += " convert debugsvnlog"
+commands.norepo += " convert splitrepo debugsvnlog"
 
 converters = [convert_cvs, convert_git, convert_svn, mercurial_source,
               mercurial_sink]
@@ -400,12 +401,14 @@ def _convert(ui, src, dest, revmapfile, 
 
     destc = convertsink(ui, dest)
 
-    try:
-        srcc = convertsource(ui, src, rev=opts.get('rev'))
-    except Exception:
-        if created:
-            shutil.rmtree(dest, True)
-        raise
+    srcc = src
+    if isinstance(src, basestring):
+        try:
+            srcc = convertsource(ui, src, rev=opts.get('rev'))
+        except Exception:
+            if created:
+                shutil.rmtree(dest, True)
+            raise
 
     if not revmapfile:
         try:
@@ -472,6 +475,42 @@ def convert(ui, src, dest=None, revmapfi
     """
     return _convert(ui, src, dest, revmapfile, opts)
 
+def _splitrepo(ui, src, dest=None, revmapfile=None, **opts):
+    """split a repository on a per-file basis, preserving history
+
+    Split a repository following the rules present in the file given
+    with the --filemap mandatory option.  This will add to the target
+    repository only the revisions necessary to preserve the history
+    of the specified files.
+
+    See the help for hg convert for information about other options
+    and the revmapfile.
+
+    The filemap is a file that allows filtering and remapping of files
+    and directories.  Comment lines start with '#'.  Each line can
+    contain one of the following directives:
+
+      include path/to/file
+
+      exclude path/to/file
+
+      rename from/file to/file
+    
+    The 'include' directive causes a file, or all files under a
+    directory, to be included in the destination repository.  The
+    'exclude' directive causes files or directories to be omitted.
+    The 'rename' directive renames a file or directory.  To rename
+    from a subdirectory into the root of the repository, use '.' as
+    the path to rename to.
+    """
+
+    if not opts.get('filemap'):
+        raise util.Abort(_('you have to specify a filemap file'))
+
+    fmap = filemapper(ui, opts['filemap'])
+    opts['filemap'] = ''
+    srcc = splitrepo.splitrepo(ui, src, fmap, opts.get('rev'))
+    return _convert(ui, srcc, dest, revmapfile, opts)
 
 cmdtable = {
     "convert":
@@ -481,6 +520,13 @@ cmdtable = {
           ('r', 'rev', '', 'import up to target revision REV'),
           ('', 'datesort', None, 'try to sort changesets by date')],
          'hg convert [OPTION]... SOURCE [DEST [MAPFILE]]'),
+    "splitrepo":
+        (_splitrepo,
+         [('A', 'authors', '', 'username mapping filename'),
+          ('f', 'filemap', '', 'remap file names using contents of file'),
+          ('r', 'rev', '', 'import up to target revision REV'),
+          ('', 'datesort', None, 'try to sort changesets by date')],
+         'hg splitrepo --filemap FILEMAP [OPTION]... SOURCE [DEST [MAPFILE]]'),
     "debugsvnlog":
         (debugsvnlog,
          [],
diff -r 14b43c5dfd00 -r a9d7dd149cb2 hgext/convert/splitrepo.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/convert/splitrepo.py	Sat Sep 01 03:48:34 2007 -0300
@@ -0,0 +1,245 @@
+# splitrepo.py - Split a repository, keeping the history
+#
+# Copyright (C) 2007 Alexis S. L. Carvalho <alexis at cecm.usp.br>
+#
+# This software may be used and distributed according to the terms
+# of the GNU General Public License, incorporated herein by reference.
+
+from common import SKIPREV
+from hg import mercurial_source
+from mercurial import util
+from mercurial.node import *
+
+# This class does two additional things compared to a regular source:
+#
+# - Filter and rename files.  This is mostly wrapped by filemapper.
+#   We hide the original filename in the revision that is returned by
+#   getchanges to be able to find things later in getfile and getmode.
+#
+# - Return only revisions that matter for the files we're interested in.
+#   This involves rewriting the parents of the original revision to
+#   create a graph that is restricted to those revisions.
+#
+#   This set of revisions includes not only revisions that directly
+#   touch files we're interested in, but also merges that merge two
+#   interesting revisions.
+#
+#   To find these merges, we keep, for every revision in the original
+#   graph, the set of ancestors of that revision that are also in the
+#   restricted graph.  When we see a merge that we're not directly
+#   interested in, we compare the sets from the parents; if they differ,
+#   we include the merge in the restricted graph.
+
+class splitrepo(mercurial_source):
+    def __init__(self, ui, path, filemapper, rev=None):
+        mercurial_source.__init__(self, ui, path, rev)
+        self.filemapper = filemapper
+        self.commits = {}
+        # if a revision rev has parent p in the original revision graph, then
+        # rev will have parent self.parentmap[p] in the restricted graph.
+        self.parentmap = {}
+        # self.wantedancestors[rev] is the set of all ancestors of rev that
+        # are in the restricted graph.
+        self.wantedancestors = {}
+
+    def setrevmap(self, revmap):
+        # rebuild our state to make things restartable
+        cl = self.repo.changelog
+
+        for r in xrange(cl.count()):
+            rev = hex(cl.node(r))
+            mapped = revmap.get(rev)
+            if mapped is None:
+                continue
+            parents = [hex(cl.node(p)) for p in cl.parentrevs(r)
+                       if p != nullrev]
+            if mapped != SKIPREV:
+                self._mark_wanted(rev, parents)
+            else:
+                self.mark_not_wanted(rev, parents)
+
+    def getcommit(self, rev):
+        # we want to save a reference to the commit objects to be able
+        # to rewrite their parents later.
+        self.commits[rev] = mercurial_source.getcommit(self, rev)
+        return self.commits[rev]
+
+    def _files(self, rev):
+        # returns the files needed to decide if rev is wanted or not
+        #
+        # I.e. the files that have changed compared to ALL parents
+        ctx = self.changectx(rev)
+        parents = [p for p in ctx.parents() if p]
+        if len(parents) != 2:
+            # simple case
+            node1 = parents and parents[0].node() or nullid
+            changes = self.repo.status(node1, ctx.node())[:3]
+            self._changescache = (rev, changes)
+            return changes[0] + changes[1] + changes[2]
+
+        # We're going to need the manifests for rev and its parents.
+        # We try to do something not too dumb to reduce the work
+        # needed to build all 3 manifests, taking into account that:
+        #
+        # - getchanges will need the result of repo.status(p1, rev)
+        #   (at least if it turns out that we want rev)
+        # - we're going to need the manifest for rev to lookup the file
+        #   nodes in that revision
+        #
+        # So we start by reading p2.manifest() and read
+        # p1.manifest()/rev.manifest() right before/after the call to
+        # repo.status.
+        # 
+        p1, p2 = parents
+        manp2 = p2.manifest()
+        manp1 = p1.manifest()
+
+        changes = self.repo.status(p1.node(), ctx.node())[:3]
+
+        man = ctx.manifest()
+        files = changes[0] + changes[1] + changes[2]
+        intersection = [f for f in files
+                        if manp1.get(f) != man.get(f) != manp2.get(f)]
+        self._changescache = (rev, changes)
+        return intersection
+
+    def wanted(self, rev):
+        # Return True if we're directly interested in rev.
+        #
+        # I.e. if rev touched one of the files we're interested in.
+        for f in self._files(rev):
+            if self.filemapper(f):
+                return True
+        return False
+
+    def mark_not_wanted(self, rev, parents):
+        # Mark rev as not interesting and update data structures.
+
+        pmap = self.parentmap
+        wa = self.wantedancestors
+        if not parents:
+            # A root revision. Use None to indicate that it doesn't
+            # have parents in the restricted graph.  Put None in the
+            # set of wanted ancestors to simplify code elsewhere
+            pmap[rev] = None
+            wa[rev] = util.set((None,))
+            return
+
+        # We want to reuse the data from one of our parents.  If rev is
+        # a merge, then either parentmap[parents[0]] is an ancestor of
+        # parentmap[parents[1]] or vice-versa (otherwise getchanges would
+        # have kept the merge).  Choose the parent that is not the ancestor.
+        p = parents[0]
+        if len(parents) == 2 and pmap[p] in wa[parents[1]]:
+            p = parents[1]
+        pmap[rev] = pmap[p]
+        wa[rev] = wa[p]
+
+    def _mark_wanted(self, rev, parents):
+        # rev is wanted. update data structures
+
+        # rev will be in the restricted graph, so children of rev in
+        # the original graph should still have rev as a parent in the
+        # restricted graph.
+        self.parentmap[rev] = rev
+
+        # The set of wanted ancestors of rev is the union of the sets
+        # of wanted ancestors of its parents. Plus rev itself.
+        wa = self.wantedancestors
+        if len(parents) == 2:
+            wrev = wa[parents[0]].union(wa[parents[1]])
+        elif parents:
+            wrev = wa[parents[0]].copy()
+        else:
+            wrev = util.set()
+        wrev.add(rev)
+        wa[rev] = wrev
+
+    def mark_wanted(self, rev, parents):
+        # rev is wanted. rewrite its parents and update data structures
+
+        # Rewrite the parents of the commit object.
+        # Usually it's enough to use parentmap to map our original
+        # parents, but for merges we want to make sure that one
+        # of the mapped parents is not an ancestor of the other.
+        gparents = [self.parentmap[p] for p in parents
+                    if self.parentmap[p] is not None]
+        if len(gparents) == 2:
+            p1, p2 = gparents
+            if p1 == p2 or p2 in self.wantedancestors[p1]:
+                gparents = [p1]
+            elif p1 in self.wantedancestors[p2]:
+                gparents = [p2]
+        self.commits[rev].parents = gparents
+
+        self._mark_wanted(rev, parents)
+
+    def getchanges(self, rev):
+        ctx = self.changectx(rev)
+        revno = ctx.rev()
+        cl = self.repo.changelog
+        parents = [hex(cl.node(p)) for p in cl.parentrevs(revno)
+                   if p != nullrev]
+
+        wa = self.wantedancestors
+        wanted = False
+        if self.wanted(rev):
+            self.mark_wanted(rev, parents)
+            wanted = True
+        elif (len(parents) == 2 and
+              (self.parentmap[parents[0]] not in wa[parents[1]] and
+               self.parentmap[parents[1]] not in wa[parents[0]])):
+            # The test above can also be seen as a cheaper way to compare
+            # wa[parents[0]] and wa[parents[1]]:  revisions are added to
+            # wa[x] in topological order and parentmap[x] is the last
+            # revision that was added.
+            # In any case, rev merges two revisions that are in the restricted
+            # graph, so we also include it.
+            self.mark_wanted(rev, parents)
+            wanted = True
+        else:
+            self.mark_not_wanted(rev, parents)
+
+        if not wanted:
+            # tell the converter to skip this revision
+            return SKIPREV
+
+        changesrev, changes = self._changescache
+        assert changesrev == rev
+        del self._changescache
+
+        # Select the files we're interested in, renaming them
+        # as appropriate.  We store the original name as part of
+        # the file revision that the converter will give back to us
+        # later, so that getfile and getmode will know what to do.
+        newnames = {}
+        def mapfiles(l):
+            newl = []
+            for f in l:
+                newf = self.filemapper(f)
+                if newf:
+                    newl.append((newf, (f, rev)))
+                    newnames[f] = newf
+            return newl
+        m = mapfiles(changes[0])
+        a = mapfiles(changes[1])
+        r = mapfiles(changes[2])
+
+        # Get the copies and rename the sources when necessary.
+        checkcopies = [x[1][0] for x in m + a]
+        copies = self.getcopies(ctx, checkcopies)
+        ncopies = {}
+        for c in copies:
+            newc = self.filemapper(copies[c])
+            if newc:
+                ncopies[newnames[c]] = newc
+        return m + a + r, ncopies
+
+    def getfile(self, name, rev):
+        realname, realrev = rev
+        return mercurial_source.getfile(self, realname, realrev)
+
+    def getmode(self, name, rev):
+        realname, realrev = rev
+        return mercurial_source.getmode(self, realname, realrev)
+
diff -r 14b43c5dfd00 -r a9d7dd149cb2 tests/test-splitrepo
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-splitrepo	Sat Sep 01 03:48:34 2007 -0300
@@ -0,0 +1,120 @@
+#!/bin/sh
+
+HGMERGE=true; export HGMERGE
+
+echo '[extensions]' >> $HGRCPATH
+echo 'hgext.graphlog =' >> $HGRCPATH
+echo 'hgext.convert =' >> $HGRCPATH
+
+glog()
+{
+    hg glog --template '#rev# "#desc#" files: #files#\n' "$@"
+}
+
+hg init source
+cd source
+
+echo foo > foo
+echo baz > baz
+mkdir dir
+echo dir/file >> dir/file
+echo dir/file2 >> dir/file2
+hg ci -d '0 0' -qAm '0: add foo baz dir/'
+
+echo bar > bar
+echo quux > quux
+hg copy foo copied
+hg ci -d '1 0' -qAm '1: add bar quux; copy foo to copied'
+
+echo >> foo
+hg ci -d '2 0' -m '2: change foo'
+
+hg up -qC 1
+echo >> bar
+echo >> quux
+hg ci -d '3 0' -m '3: change bar quux'
+
+hg up -qC 2
+hg merge -qr 3
+echo >> bar
+echo >> baz
+hg ci -d '4 0' -m '4: first merge; change bar baz'
+
+echo >> bar
+echo 1 >> baz
+echo >> quux
+hg ci -d '5 0' -m '5: change bar baz quux'
+
+hg up -qC 4
+echo >> foo
+echo 2 >> baz
+hg ci -d '6 0' -m '6: change foo baz'
+
+hg up -qC 5
+hg merge -qr 6
+echo >> bar
+hg ci -d '7 0' -m '7: second merge; change bar'
+
+echo >> foo
+hg ci -m '8: change foo'
+
+glog
+
+echo '% final file versions in this repo:'
+hg manifest --debug
+hg debugrename copied
+echo
+
+cd ..
+
+splitrepo()
+{
+    msg="$1"
+    files="$2"
+    opts=$3
+    echo "% $files: $msg"
+    prefix=`echo "$files" | sed -e 's/ /-/g'`
+    fmap="$prefix.fmap"
+    repo="$prefix.repo"
+    for i in $files; do
+	echo "include $i" >> "$fmap"
+    done
+    hg -q splitrepo $opts --filemap "$fmap" --datesort source "$repo"
+    glog -R "$repo"
+    hg -R "$repo" manifest --debug
+}
+
+splitrepo 'skip unwanted merges; use 1st parent in 1st merge, 2nd in 2nd' foo
+
+splitrepo 'merges are not merges anymore' bar
+
+splitrepo '1st merge is not a merge anymore; 2nd still is' baz
+
+splitrepo 'we add additional merges when they are interesting' 'foo quux'
+
+splitrepo 'partial conversion' 'bar quux' '-r 3'
+splitrepo 'complete the partial conversion' 'bar quux'
+
+splitrepo 'copied file; source not included in new repo' copied
+hg --cwd copied.repo debugrename copied
+
+splitrepo 'copied file; source included in new repo' 'foo copied'
+hg --cwd foo-copied.repo debugrename copied
+
+cat > renames.fmap <<EOF
+include dir
+exclude dir/file2
+rename dir dir2
+include foo
+include copied
+rename foo foo2
+rename copied copied2
+EOF
+hg -q splitrepo --filemap renames.fmap --datesort source renames.repo
+glog -R renames.repo
+hg -R renames.repo manifest --debug
+hg --cwd renames.repo debugrename copied2
+echo 'copied:'
+hg --cwd source cat copied
+echo 'copied2:'
+hg --cwd renames.repo cat copied2
diff -r 14b43c5dfd00 -r a9d7dd149cb2 tests/test-splitrepo.out
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-splitrepo.out	Sat Sep 01 03:48:34 2007 -0300
@@ -0,0 +1,140 @@
+@  8 "8: change foo" files: foo
+|
+o    7 "7: second merge; change bar" files: bar baz
+|\
+| o  6 "6: change foo baz" files: baz foo
+| |
+o |  5 "5: change bar baz quux" files: bar baz quux
+|/
+o    4 "4: first merge; change bar baz" files: bar baz
+|\
+| o  3 "3: change bar quux" files: bar quux
+| |
+o |  2 "2: change foo" files: foo
+|/
+o  1 "1: add bar quux; copy foo to copied" files: bar copied quux
+|
+o  0 "0: add foo baz dir/" files: baz dir/file dir/file2 foo
+
+% final file versions in this repo:
+9463f52fe115e377cf2878d4fc548117211063f2 644 bar
+94c1be4dfde2ee8d78db8bbfcf81210813307c3d 644 baz
+6ca237634e1f6bee1b6db94292fb44f092a25842 644 copied
+3e20847584beff41d7cd16136b7331ab3d754be0 644 dir/file
+75e6d3f8328f5f6ace6bf10b98df793416a09dca 644 dir/file2
+9a7b52012991e4873687192c3e17e61ba3e837a3 644 foo
+bc3eca3f47023a3e70ca0d8cc95a22a6827db19d 644 quux
+copied renamed from foo:2ed2a3912a0b24502043eae84ee4b279c18b90dd
+
+% foo: skip unwanted merges; use 1st parent in 1st merge, 2nd in 2nd
+o  3 "8: change foo" files: foo
+|
+o  2 "6: change foo baz" files: foo
+|
+o  1 "2: change foo" files: foo
+|
+o  0 "0: add foo baz dir/" files: foo
+
+9a7b52012991e4873687192c3e17e61ba3e837a3 644 foo
+% bar: merges are not merges anymore
+o  4 "7: second merge; change bar" files: bar
+|
+o  3 "5: change bar baz quux" files: bar
+|
+o  2 "4: first merge; change bar baz" files: bar
+|
+o  1 "3: change bar quux" files: bar
+|
+o  0 "1: add bar quux; copy foo to copied" files: bar
+
+9463f52fe115e377cf2878d4fc548117211063f2 644 bar
+% baz: 1st merge is not a merge anymore; 2nd still is
+o    4 "7: second merge; change bar" files: baz
+|\
+| o  3 "6: change foo baz" files: baz
+| |
+o |  2 "5: change bar baz quux" files: baz
+|/
+o  1 "4: first merge; change bar baz" files: baz
+|
+o  0 "0: add foo baz dir/" files: baz
+
+94c1be4dfde2ee8d78db8bbfcf81210813307c3d 644 baz
+% foo quux: we add additional merges when they are interesting
+o  8 "8: change foo" files: foo
+|
+o    7 "7: second merge; change bar" files:
+|\
+| o  6 "6: change foo baz" files: foo
+| |
+o |  5 "5: change bar baz quux" files: quux
+|/
+o    4 "4: first merge; change bar baz" files:
+|\
+| o  3 "3: change bar quux" files: quux
+| |
+o |  2 "2: change foo" files: foo
+|/
+o  1 "1: add bar quux; copy foo to copied" files: quux
+|
+o  0 "0: add foo baz dir/" files: foo
+
+9a7b52012991e4873687192c3e17e61ba3e837a3 644 foo
+bc3eca3f47023a3e70ca0d8cc95a22a6827db19d 644 quux
+% bar quux: partial conversion
+o  1 "3: change bar quux" files: bar quux
+|
+o  0 "1: add bar quux; copy foo to copied" files: bar quux
+
+b79105bedc55102f394e90a789c9c380117c1b4a 644 bar
+db0421cc6b685a458c8d86c7d5c004f94429ea23 644 quux
+% bar quux: complete the partial conversion
+o  4 "7: second merge; change bar" files: bar
+|
+o  3 "5: change bar baz quux" files: bar quux
+|
+o  2 "4: first merge; change bar baz" files: bar
+|
+o  1 "3: change bar quux" files: bar quux
+|
+o  0 "1: add bar quux; copy foo to copied" files: bar quux
+
+9463f52fe115e377cf2878d4fc548117211063f2 644 bar
+bc3eca3f47023a3e70ca0d8cc95a22a6827db19d 644 quux
+% copied: copied file; source not included in new repo
+o  0 "1: add bar quux; copy foo to copied" files: copied
+
+2ed2a3912a0b24502043eae84ee4b279c18b90dd 644 copied
+copied not renamed
+% foo copied: copied file; source included in new repo
+o  4 "8: change foo" files: foo
+|
+o  3 "6: change foo baz" files: foo
+|
+o  2 "2: change foo" files: foo
+|
+o  1 "1: add bar quux; copy foo to copied" files: copied
+|
+o  0 "0: add foo baz dir/" files: foo
+
+6ca237634e1f6bee1b6db94292fb44f092a25842 644 copied
+9a7b52012991e4873687192c3e17e61ba3e837a3 644 foo
+copied renamed from foo:2ed2a3912a0b24502043eae84ee4b279c18b90dd
+o  4 "8: change foo" files: foo2
+|
+o  3 "6: change foo baz" files: foo2
+|
+o  2 "2: change foo" files: foo2
+|
+o  1 "1: add bar quux; copy foo to copied" files: copied2
+|
+o  0 "0: add foo baz dir/" files: dir2/file foo2
+
+e5e3d520be9be45937d0b06b004fadcd6c221fa2 644 copied2
+3e20847584beff41d7cd16136b7331ab3d754be0 644 dir2/file
+9a7b52012991e4873687192c3e17e61ba3e837a3 644 foo2
+copied2 renamed from foo2:2ed2a3912a0b24502043eae84ee4b279c18b90dd
+copied:
+foo
+copied2:
+foo


More information about the Mercurial-devel mailing list