Splitting a repository

Alexis S. L. Carvalho alexis at cecm.usp.br
Sat May 19 21:12:44 CDT 2007

Thus spake Timo Sirainen:
> I just moved from CVS to Mercurial today, but soon I remembered one of
> my old plans: Splitting the project (Dovecot) to 3 separate
> repositories, so in the end I would have:
>  - "lib" containing original repository's src/lib/ and its history
>  - "auth" containing original repository's src/auth/ and some other
> directories, and their history
>  - The original repository with lib and auth code removed, but
> containing all the history
> So the "lib" and "auth" repositories really wouldn't need to have the
> extra 10MB of history for files that don't exist.
> If I had to do this now, I would write a script that looked up
> changesets containing changes to files in wanted directories, exported
> the changesets, dropped out changes for files outside the wanted dirs,
> and imported the changesets to the new repository.
> Is there an easier way to do this? Or has someone already written a
> similar script?

Playing with this a bit I came up with the modified convert-repo that is
attached.

I haven't tested it much, but the resulting repos looked ok...

Right now it's hard-coded to create a repo with only the files from a
"mercurial/hgweb/" directory.  Look for a variable called "basedir" to
change this.

It moves all files from the given directory to the root of the repo.
To avoid this, change the "xformfile" function.

I wasn't sure what to do about tags, so the current version always adds
tagged revisions to the new repo, even if they don't touch any
interesting paths.  This is controlled by the "wanted" function.

I haven't tried to add any special handling of .hgtags, so depending on
your repo, you may end up with a .hgtags that points to hashes that
don't exist anymore (at worst this will generate some warnings from hg).
Maybe I should've just filtered it out.  Hmm...

Hope this helps.

-------------- next part --------------
#!/usr/bin/env python
# This is a generalized framework for converting between SCM
# repository formats.
# To use, run:
# convert-repo <source> [<dest> [<mapfile>]]
# Currently accepted source formats: git, cvs, hg
# Currently accepted destination formats: hg
# If destination isn't given, a new Mercurial repo named <src>-hg will
# be created. If <mapfile> isn't given, it will be put in a default
# location (<dest>/.hg/shamap by default)
# The <mapfile> is a simple text file that maps each source commit ID to
# the destination ID for that revision, like so:
# <source ID> <destination ID>
# If the file doesn't exist, it's automatically created.  It's updated
# on each commit copied, so convert-repo can be interrupted and can
# be run repeatedly to copy new commits.

import sys, os, zlib, sha, time, re, locale, socket
os.environ["HGENCODING"] = "utf-8"
from mercurial import hg, ui, util, fancyopts

class Abort(Exception):
    """Raised (through abort()) to stop the conversion with a message."""


class NoRepo(Exception):
    """Raised by a converter class whose repository type is not at the path."""

class commit:
    """Plain record describing one changeset to be converted.

    Every keyword argument becomes an attribute of the instance.  The
    fields "author", "date", "desc" and "parents" are mandatory (abort()
    is called when one is missing); any other field, such as "branch",
    is optional.
    """
    def __init__(self, **parts):
        for x in "author date desc parents".split():
            if x not in parts:
                abort("commit missing field %s\n" % x)
        # Expose the fields as attributes: the rest of the script reads
        # them as commit.author, commit.date, commit.desc, commit.parents
        # and (optionally) commit.branch.
        self.__dict__.update(parts)

quiet = 0
def status(msg):
    """Write msg (converted with str) to stdout unless quiet is set."""
    if quiet:
        return
    sys.stdout.write(str(msg))

def warn(msg):
    """Write msg (converted with str) to stderr.

    NOTE(review): the body of this function was missing from the file;
    writing to stderr is restored as the counterpart of status() --
    confirm against the original script.
    """
    sys.stderr.write(str(msg))
def abort(msg):
    """Stop the conversion by raising Abort carrying msg."""
    failure = Abort(msg)
    raise failure

def recode(s):
    """Return the byte string s re-encoded as UTF-8.

    s is first tried as UTF-8, then as Latin-1; as a last resort it is
    decoded as UTF-8 with undecodable bytes replaced.
    """
    try:
        return s.decode("utf-8").encode("utf-8")
    except UnicodeError:
        try:
            return s.decode("latin-1").encode("utf-8")
        except UnicodeError:
            return s.decode("utf-8", "replace").encode("utf-8")

# CVS conversion code inspired by hg-cvs-import and git-cvsimport
class convert_cvs:
    """Conversion source that reads a CVS checkout.

    The list of changesets is parsed from the output of the external
    "cvsps" command run inside the checkout; file contents are fetched
    over a connection to the CVS server (pserver socket, rsh, or a local
    "cvs server" process).
    """
    def __init__(self, path):
        """Open the checkout at path; raise NoRepo if path/CVS is missing."""
        self.path = path
        cvs = os.path.join(path, "CVS")
        if not os.path.exists(cvs):
            raise NoRepo("couldn't open CVS repo %s" % path)

        self.changeset = {}   # patchset id -> commit
        self.files = {}       # patchset id -> {file name: revision}
        self.tags = {}        # tag name -> patchset id
        self.lastbranch = {}  # branch name -> last patchset id seen on it
        self.parent = {}      # patchset id -> parent patchset id
        self.socket = None
        self.cvsroot = self._readmeta(cvs, "Root")
        self.cvsrepo = self._readmeta(cvs, "Repository")
        self.encoding = locale.getpreferredencoding()
        # NOTE(review): nothing else in the file calls _parse()/_connect(),
        # yet getheads() and _getfile() need the state they set up, so the
        # calls are restored here -- confirm against the original script.
        self._parse()
        self._connect()

    def _readmeta(self, cvsdir, name):
        """Return the contents of cvsdir/name minus the last character."""
        fp = open(os.path.join(cvsdir, name))
        try:
            return fp.read()[:-1]
        finally:
            fp.close()

    def _parse(self):
        """Fill changeset/files/tags/parent/heads from cvsps output.

        Does nothing when the changesets have already been parsed.
        """
        if self.changeset:
            return

        d = os.getcwd()
        try:
            # cvsps is run from inside the checkout; the previous working
            # directory is restored afterwards.
            os.chdir(self.path)
            id = None
            state = 0
            for l in os.popen("cvsps -A -u --cvs-direct -q"):
                if state == 0: # header
                    if l.startswith("PatchSet"):
                        id = l[9:-2]
                    elif l.startswith("Date"):
                        date = util.parsedate(l[6:-1], ["%Y/%m/%d %H:%M:%S"])
                        date = util.datestr(date)
                    elif l.startswith("Branch"):
                        branch = l[8:-1]
                        self.parent[id] = self.lastbranch.get(branch,'bad')
                        self.lastbranch[branch] = id
                    elif l.startswith("Ancestor branch"):
                        ancestor = l[17:-1]
                        self.parent[id] = self.lastbranch[ancestor]
                    elif l.startswith("Author"):
                        author = self.recode(l[8:-1])
                    elif l.startswith("Tag: "):
                        t = l[5:-1].rstrip()
                        if t != "(none)":
                            self.tags[t] = id
                    elif l.startswith("Log:"):
                        state = 1
                        log = ""
                elif state == 1: # log
                    if l == "Members: \n":
                        files = {}
                        log = self.recode(log[:-1])
                        if log.isspace():
                            log = "*** empty log message ***\n"
                        state = 2
                    else:
                        log += l
                elif state == 2: # member list
                    if l == "\n": # a blank line ends the patchset
                        state = 0
                        p = [self.parent[id]]
                        if id == "1":
                            p = []
                        c = commit(author=author, date=date, parents=p,
                                   desc=log, branch=branch)
                        self.changeset[id] = c
                        self.files[id] = files
                    else:
                        name, rev = l[1:-2].rsplit(':',1)
                        rev = rev.split("->")[1]
                        files[name] = rev

            self.heads = self.lastbranch.values()
        finally:
            os.chdir(d)

    def _connect(self):
        """Connect to the CVS server named in CVS/Root and do the handshake.

        Sets self.writep and self.readp (request and response streams) and
        self.realroot (the root path sent to the server).  Raises NoRepo
        when pserver authentication fails; calls abort() when the server
        does not answer with "Valid-requests".
        """
        root = self.cvsroot
        conntype = None
        user, host = None, None
        cmd = ['cvs', 'server']

        status("connecting to %s\n" % root)

        if root.startswith(":pserver:"):
            root = root[9:]
            m = re.match(r'(?:(.*?)(?::(.*?))?@)?([^:\/]*)(?::(\d*))?(.*)', root)
            if m:
                conntype = "pserver"
                user, passw, serv, port, root = m.groups()
                if not user:
                    user = "anonymous"
                rr = ":pserver:" + user + "@" + serv + ":" +  root
                if port:
                    rr2, port = "-", int(port)
                else:
                    rr2, port = rr, 2401
                rr += str(port)

                if not passw:
                    passw = "A"
                    pf = open(os.path.join(os.environ["HOME"], ".cvspass"))
                    try:
                        for l in pf:
                            # :pserver:cvs@mea.tmt.tele.fi:/cvsroot/zmailer Ah<Z
                            m = re.match(r'(/\d+\s+/)?(.*)', l)
                            l = m.group(2)
                            w, p = l.split(' ', 1)
                            if w in [rr, rr2]:
                                passw = p
                    finally:
                        pf.close()

                sck = socket.socket()
                sck.connect((serv, port))
                sck.send("\n".join(["BEGIN AUTH REQUEST", root, user, passw, "END AUTH REQUEST", ""]))
                if sck.recv(128) != "I LOVE YOU\n":
                    raise NoRepo("CVS pserver authentication failed")

                self.writep = self.readp = sck.makefile('r+')

        if not conntype and root.startswith(":local:"):
            conntype = "local"
            root = root[7:]

        if not conntype:
            # :ext:user@host/home/user/path/to/cvsroot
            if root.startswith(":ext:"):
                root = root[5:]
            m = re.match(r'(?:([^@:/]+)@)?([^:/]+):?(.*)', root)
            if not m:
                conntype = "local"
            else:
                conntype = "rsh"
                user, host, root = m.group(1), m.group(2), m.group(3)

        if conntype != "pserver":
            if conntype == "rsh":
                # Fall back to "rsh" when CVS_RSH is unset.  The old
                # get("CVS_RSH" or "rsh") only ever looked up "CVS_RSH"
                # and produced None when it was missing.
                rsh = os.environ.get("CVS_RSH", "rsh")
                if user:
                    cmd = [rsh, '-l', user, host] + cmd
                else:
                    cmd = [rsh, host] + cmd

            self.writep, self.readp = os.popen2(cmd)

        self.realroot = root

        self.writep.write("Root %s\n" % root)
        self.writep.write("Valid-responses ok error Valid-requests Mode"
                          " M Mbinary E Checked-in Created Updated"
                          " Merged Removed\n")
        # NOTE(review): the request lines and flushes below were missing
        # from the file; they are restored from the CVS client/server
        # protocol (a "valid-requests" request is what elicits the
        # "Valid-requests" response checked next) -- confirm.
        self.writep.write("valid-requests\n")
        self.writep.flush()
        r = self.readp.readline()
        if not r.startswith("Valid-requests"):
            abort("server sucks\n")
        if "UseUnchanged" in r:
            self.writep.write("UseUnchanged\n")
            self.writep.flush()
            r = self.readp.readline()

    def getheads(self):
        """Return the last patchset id of every branch seen by _parse()."""
        return self.heads

    def _getfile(self, name, rev):
        """Check out revision rev of name; return (data, mode).

        mode is "x" when the mode line sent by the server contains an "x",
        otherwise "".  Raises IOError for a revision marked "(DEAD)".
        """
        if rev.endswith("(DEAD)"):
            raise IOError

        args = ("-N -P -kk -r %s --" % rev).split()
        args.append(os.path.join(self.cvsrepo, name))
        for x in args:
            self.writep.write("Argument %s\n" % x)
        self.writep.write("Directory .\n%s\nco\n" % self.realroot)
        # NOTE(review): restored; without a flush a buffered request may
        # never reach the server -- confirm.
        self.writep.flush()

        data = ""
        # Stays empty when the server never sends a mode line (e.g. a
        # plain Mbinary reply); previously that path hit an unbound local.
        mode = ""
        while 1:
            line = self.readp.readline()
            if line.startswith("Created ") or line.startswith("Updated "):
                self.readp.readline() # path
                self.readp.readline() # entries
                mode = self.readp.readline()[:-1]
                count = int(self.readp.readline()[:-1])
                data = self.readp.read(count)
            elif line.startswith(" "):
                data += line[1:]
            elif line.startswith("M "):
                pass
            elif line.startswith("Mbinary "):
                count = int(self.readp.readline()[:-1])
                data = self.readp.read(count)
            else:
                if line == "ok\n":
                    if "x" in mode:
                        return (data, "x")
                    return (data, "")
                elif line.startswith("E "):
                    warn("cvs server: %s\n" % line[2:])
                elif line.startswith("Remove"):
                    l = self.readp.readline()
                    l = self.readp.readline()
                    if l != "ok\n":
                        abort("unknown CVS response: %s\n" % l)
                else:
                    abort("unknown CVS response: %s\n" % line)

    def getfile(self, file, rev):
        """Return the contents of file at rev and remember its mode."""
        data, mode = self._getfile(file, rev)
        self.modecache[(file, rev)] = mode
        return data

    def getmode(self, file, rev):
        """Return the mode recorded by the last getfile(file, rev)."""
        return self.modecache[(file, rev)]

    def getchanges(self, rev):
        """Return [(file name, revision)] for patchset rev.

        Also resets the mode cache filled by getfile().
        """
        self.modecache = {}
        files = self.files[rev]
        return list(files.items())

    def recode(self, text):
        """Re-encode text from the locale's preferred encoding to UTF-8."""
        return text.decode(self.encoding, "replace").encode("utf-8")

    def getcommit(self, rev):
        """Return the commit parsed for patchset rev."""
        return self.changeset[rev]

    def gettags(self):
        """Return the {tag name: patchset id} map built by _parse()."""
        return self.tags

class convert_git:
    """Conversion source that reads a git repository via the git tools."""
    def __init__(self, path):
        """Use path (or path/.git when present); raise NoRepo otherwise."""
        if os.path.isdir(path + "/.git"):
            path += "/.git"
        self.path = path
        if not os.path.exists(path + "/objects"):
            raise NoRepo("couldn't open GIT repo %s" % path)

    def getheads(self):
        """Return a one-element list holding the id that HEAD resolves to."""
        fh = os.popen("GIT_DIR=%s git-rev-parse --verify HEAD" % self.path)
        return [fh.read()[:-1]]

    def catfile(self, rev, type):
        """Return the raw git object rev of the given type.

        Raises IOError for the all-zero id.
        """
        if rev == "0" * 40: raise IOError()
        fh = os.popen("GIT_DIR=%s git-cat-file %s %s 2>/dev/null" % (self.path, type, rev))
        return fh.read()

    def getfile(self, name, rev):
        """Return the contents of blob rev (name is not used)."""
        return self.catfile(rev, "blob")

    def getmode(self, name, rev):
        """Return the mode flag recorded by getchanges() for (name, rev)."""
        return self.modecache[(name, rev)]

    def getchanges(self, version):
        """Return [(file name, blob id)] changed by commit version.

        Also rebuilds the mode cache: "x" for mode 100755, "l" for mode
        120000, "" otherwise.
        """
        self.modecache = {}
        fh = os.popen("GIT_DIR=%s git-diff-tree --root -m -r %s" % (self.path, version))
        changes = []
        for l in fh:
            if "\t" not in l: continue
            m, f = l[:-1].split("\t")
            m = m.split()
            h = m[3]
            p = (m[1] == "100755")
            s = (m[1] == "120000")
            self.modecache[(f, h)] = (p and "x") or (s and "l") or ""
            changes.append((f, h))
        return changes

    def _tzoffset(self, tz):
        """Turn a "+HHMM"/"-HHMM" zone string into an offset in seconds.

        The result has the opposite sign of the input, e.g. "+0100" gives
        -3600.  Minutes are scaled by 60; the previous inline computation
        added them as if they were seconds.
        """
        tzs, tzh, tzm = tz[-5:-4] + "1", tz[-4:-2], tz[-2:]
        return -int(tzs) * (int(tzh) * 3600 + int(tzm) * 60)

    def getcommit(self, version):
        """Parse commit object version into a commit record.

        The committer is appended to the description, and the timestamp
        and zone of the last author/committer header read are used as the
        date.
        """
        c = self.catfile(version, "commit") # read the commit hash
        end = c.find("\n\n")
        message = c[end+2:]
        message = recode(message)
        l = c[:end].splitlines()
        parents = []
        # The first header line is skipped; only author, committer and
        # parent headers are interpreted.
        for e in l[1:]:
            n,v = e.split(" ", 1)
            if n == "author":
                p = v.split()
                tm, tz = p[-2:]
                author = " ".join(p[:-2])
                if author[0] == "<": author = author[1:-1]
                author = recode(author)
            if n == "committer":
                p = v.split()
                tm, tz = p[-2:]
                committer = " ".join(p[:-2])
                if committer[0] == "<": committer = committer[1:-1]
                committer = recode(committer)
                message += "\ncommitter: %s\n" % committer
            if n == "parent": parents.append(v)

        date = tm + " " + str(self._tzoffset(tz))

        c = commit(parents=parents, date=date, author=author, desc=message)
        return c

    def gettags(self):
        """Return {tag name: commit id} for the peeled ("^{}") tag refs."""
        tags = {}
        fh = os.popen('git-ls-remote --tags "%s" 2>/dev/null' % self.path)
        prefix = 'refs/tags/'
        for line in fh:
            line = line.strip()
            if not line.endswith("^{}"):
                continue
            node, tag = line.split(None, 1)
            if not tag.startswith(prefix):
                continue
            # drop the "refs/tags/" prefix and the trailing "^{}"
            tag = tag[len(prefix):-3]
            tags[tag] = node

        return tags

class convert_mercurial:
    """Conversion destination that writes into a Mercurial repository."""
    def __init__(self, path):
        """Open the repository at path; raise NoRepo if that fails."""
        self.path = path
        u = ui.ui()
        try:
            self.repo = hg.repository(u, path)
        except Exception:
            raise NoRepo("couldn't open hg repo %s" % path)

    def mapfile(self):
        """Return the default location of the source->dest revision map."""
        return os.path.join(self.path, ".hg", "shamap")

    def getheads(self):
        """Return the hex ids of the changelog heads."""
        h = self.repo.changelog.heads()
        return [ hg.hex(x) for x in h ]

    def putfile(self, f, e, data):
        """Write data to working-directory file f with flags e.

        A file the dirstate reports as unknown ('?') is marked as added.
        """
        self.repo.wwrite(f, data, e)
        if self.repo.dirstate.state(f) == '?':
            self.repo.dirstate.update([f], "a")

    def delfile(self, f):
        """Remove f from the working directory, ignoring a missing file.

        NOTE(review): the body of this method was missing from the file;
        it is restored as a plain unlink of the working copy -- confirm
        against the original script.
        """
        try:
            os.unlink(self.repo.wjoin(f))
        except OSError:
            pass

    def putcommit(self, files, parents, commit):
        """Commit files with the given parents; return the new hex id.

        Duplicate parents are dropped.  More than two parents are handled
        by a chain of two-parent commits, each one merging the previous
        result with the next parent.
        """
        seen = {}
        pl = []
        for p in parents:
            if p not in seen:
                pl.append(p)
                seen[p] = 1
        parents = pl

        # Pad with the null id so there are always at least two parents.
        if len(parents) < 2: parents.append("0" * 40)
        if len(parents) < 2: parents.append("0" * 40)
        p2 = parents.pop(0)

        text = commit.desc
        if not text.strip():
            text = '<empty message>'
        author = commit.author
        if not author:
            author = 'Unknown'
        extra = {}
        try:
            extra["branch"] = commit.branch
        except AttributeError:
            # branch is an optional commit field
            pass

        while parents:
            p1 = p2
            p2 = parents.pop(0)
            # Pass the author with its 'Unknown' fallback applied; the
            # fallback used to be computed and then ignored.
            self.repo.rawcommit(files, text, author, commit.date,
                                hg.bin(p1), hg.bin(p2), extra=extra)
            text = "(octopus merge fixup)\n"
            p2 = hg.hex(self.repo.changelog.tip())

        return p2

    def puttags(self, tags):
        """Rewrite .hgtags from the {tag: hex id} map if it changed.

        Returns the hex id of the tag commit, or None when .hgtags was
        already up to date.
        """
        try:
            old = self.repo.wfile(".hgtags").read()
            oldlines = old.splitlines(1)
        except IOError:
            oldlines = []

        newlines = ["%s %s\n" % (tags[tag], tag) for tag in tags]

        # NOTE(review): the sort calls were missing from the file.  Both
        # lists are sorted so the comparison below does not depend on
        # dictionary or file order -- confirm.
        oldlines.sort()
        newlines.sort()

        if newlines != oldlines:
            status("updating tags\n")
            f = self.repo.wfile(".hgtags", "w")
            # NOTE(review): restored; the file was opened for writing but
            # the lines that fill it were missing.
            f.write("".join(newlines))
            f.close()
            if not oldlines: self.repo.add([".hgtags"])
            date = "%s 0" % int(time.mktime(time.gmtime()))
            self.repo.rawcommit([".hgtags"], "update tags", "convert-repo",
                                date, self.repo.changelog.tip(), hg.nullid)
            return hg.hex(self.repo.changelog.tip())

class convert_mercurial_in:
    """Conversion source reading a Mercurial repository, filtered by path.

    Only changesets that touch files under basedir (plus tagged
    changesets and the merges needed to join them) are offered for
    conversion, and only the files under basedir are copied.
    """
    def __init__(self, path):
        """Open the repository at path; raise NoRepo if that fails."""
        self.path = path
        u = ui.ui()
        try:
            self.repo = hg.repository(u, path)
        except Exception:
            raise NoRepo("couldn't open hg repo %s" % path)

    # We want just changesets that touch this directory
    basedir = 'mercurial/hgweb/'
    if basedir and not basedir.endswith('/'):
        basedir += '/'
    def xformfile(self, f):
        "Strip the basedir from the filename"
        if f.startswith(self.basedir):
            return f[len(self.basedir):]
        # Leave other names untouched instead of returning None.
        return f

    def wantedfile(self, f):
        "Returns True if we're interested in f"
        return f.startswith(self.basedir)

    def wanted(self, ctx):
        "Returns True if we're interested in the revision represented by ctx"
        tags = self.repo.nodetags(ctx.node())
        # Any tag other than 'tip' makes the revision interesting.
        if tags and ('tip' not in tags or len(tags) > 1):
            return True
        for f in ctx.files():
            if self.wantedfile(f):
                return True
        return False

    def getheads(self):
        """Return the hex ids of the heads of the reduced graph.

        As a side effect self.cachedparents maps each wanted revision
        number to its parents (revision numbers) in the reduced graph.
        """
        # We can't just use repo.heads() because we may not be interested
        # in the real heads.  Since we'll have to eventually walk the whole
        # graph to figure out what changesets we're interested in, we might
        # as well do it now.  Notice that merges require some extra care.

        # The final parents of the changesets in the reduced graph.
        cachedparents = {}

        # Helper. If a child (in the full DAG) of a node n is in the reduced
        # graph, its new parent should be parentmap[n].
        parentmap = {}

        # wantedancestors[n] is the set of all ancestors of n that are in the
        # reduced graph.  This is used in the merge handling.  We try to avoid
        # making 1e6 different copies of these sets.
        wantedancestors = {}

        heads = {}
        cl = self.repo.changelog

        def mark_wanted(r, parents):
            # r is an interesting changeset.
            cachedparents[r] = [parentmap[p] for p in parents if parentmap[p] is not None]
            parentmap[r] = r
            if parents:
                wa = wantedancestors[parents[0]].copy()
                if len(parents) == 2:
                    # NOTE(review): this line was missing from the file;
                    # a merge inherits the wanted ancestors of both
                    # parents -- confirm.
                    wa.update(wantedancestors[parents[1]])
                wa[r] = 1
                wantedancestors[r] = wa
            else:
                wantedancestors[r] = {r: 1}

            # r is a head until one of its children is marked wanted.
            heads[r] = 1
            for p in cachedparents[r]:
                if p in heads:
                    del heads[p]

        for r in xrange(cl.count()):
            ctx = self.repo.changectx(r)
            parents = [p for p in cl.parentrevs(r) if p != hg.nullrev]
            if self.wanted(ctx):
                mark_wanted(r, parents)
            elif (len(parents) == 2 and
                  (parentmap[parents[0]] not in wantedancestors[parents[1]] and
                   parentmap[parents[1]] not in wantedancestors[parents[0]])):
                # Even though we're not directly interested in this
                # revision, it merges two revisions we're interested
                # in.
                mark_wanted(r, parents)
            elif parents:
                # We're not interested in this revision. Propagate the data
                # from our parent(s).
                p = parents[0]
                if len(parents) == 2:
                    if parentmap[p] in wantedancestors[parents[1]]:
                        p = parents[1]
                parentmap[r] = parentmap[p]
                wantedancestors[r] = wantedancestors[p]
            else:
                # Uninteresting root: nothing to inherit.  The None key
                # lets the "in wantedancestors[...]" tests above work for
                # a parentmap value of None.
                parentmap[r] = None
                wantedancestors[r] = {None: 1}

        self.cachedparents = cachedparents
        heads = heads.keys()
        return [ hg.hex(cl.node(r)) for r in heads ]

    def getcommit(self, version):
        """Return the commit record for hex id version.

        The parents are the ones computed by getheads() for the reduced
        graph, so getheads() must have run first.
        """
        r = self.repo
        cl = r.changelog
        ctx = r.changectx(version)
        rev = cl.rev(hg.bin(version))
        parents = [hg.hex(cl.node(p)) for p in self.cachedparents[rev]]
        # NOTE(review): the last argument line of this call was missing
        # from the file.  "branch" is the only optional commit field the
        # destination reads, so it is restored here -- confirm.
        c = commit(author=ctx.user(), date='%d %d' % ctx.date(),
                   desc=ctx.description(), parents=parents,
                   branch=ctx.branch())
        return c

    def getchanges(self, version):
        """Return [(new name, (original name, file node or None))].

        Only files accepted by wantedfile() are listed; the file node is
        None for removed files.  Also rebuilds the mode cache, keyed by
        (original name, file node).
        """
        files = {}
        ctx = self.repo.changectx(version)
        parents = [p for p in self.repo.changelog.parents(hg.bin(version))
                   if p != hg.nullid]
        if not parents:
            parents = [hg.nullid]
        for p in parents:
            modified, added, removed = self.repo.status(p, hg.bin(version))[:3]
            # Return only changes in files we're interested in.
            # We transform the filename here (e.g. to move directories around),
            # but we keep a reference to the original filename to be able to
            # get the file contents later on.
            for f in modified:
                if self.wantedfile(f):
                    files[self.xformfile(f)] = (f, 1)
            for f in added:
                if self.wantedfile(f):
                    files[self.xformfile(f)] = (f, 1)
            for f in removed:
                if self.wantedfile(f):
                    files[self.xformfile(f)] = (f, None)
        man = ctx.manifest()
        self.modecache = {}
        for xf in list(files):
            f, exists = files[xf]
            if exists:
                # replace the placeholder 1 with the real file node
                fn = man[f]
                files[xf] = (f, fn)
                self.modecache[(f, fn)] = man.flags(f)
        return list(files.items())

    def getfile(self, name, rev):
        """Return the file contents for rev = (original name, file node).

        Raises IOError when the file node is None (a removed file).
        """
        realname, rev = rev
        if rev is None:
            raise IOError()
        fl = self.repo.file(realname)
        return fl.read(rev)

    def getmode(self, name, rev):
        """Return the flags cached for rev = (original name, file node)."""
        return self.modecache[rev]

    def gettags(self):
        """Return {tag name: hex id} for every tag except 'tip'."""
        tags = {}
        repotags = self.repo.tags()
        for t, node in repotags.items():
            if t == 'tip':
                continue
            tags[t] = hg.hex(node)
        return tags

converters = [convert_cvs, convert_git, convert_mercurial_in, convert_mercurial]

def converter(path, out):
    """Return a converter object for the repository at path.

    Each class in converters is tried in order.  When out is true the
    class must be able to write (it has "putcommit"), otherwise it must
    be able to read (it has "getcommit").  abort() is called when path is
    not a directory or no class accepts it.
    """
    if not os.path.isdir(path):
        abort("%s: not a directory\n" % path)
    if out:
        needed = "putcommit"
    else:
        needed = "getcommit"
    for c in converters:
        try:
            conv = c(path)
        except NoRepo:
            # not this kind of repository, try the next class
            continue
        if hasattr(conv, needed):
            return conv
    abort("%s: unknown repository type\n" % path)

class convert:
    """Driver that copies changesets from a source to a destination.

    self.map maps source revision ids to destination ids.  It is loaded
    from the map file at start-up and every copied revision is appended to
    that file, so an interrupted run can be resumed.
    """
    def __init__(self, source, dest, mapfile, opts):
        """Remember the endpoints and load the map file if it exists."""

        self.source = source
        self.dest = dest
        self.mapfile = mapfile
        self.opts = opts
        self.commitcache = {}

        self.map = {}
        try:
            fp = open(self.mapfile)
        except IOError:
            # no map file yet: nothing has been converted so far
            return
        try:
            for l in fp:
                sv, dv = l[:-1].split()
                self.map[sv] = dv
        finally:
            fp.close()

    def _appendmap(self, src, dst):
        """Append one "<source id> <destination id>" line to the map file."""
        fp = open(self.mapfile, "a")
        try:
            fp.write("%s %s\n" % (src, dst))
        finally:
            fp.close()

    def walktree(self, heads):
        """Collect the not-yet-converted ancestors of heads.

        Returns {revision: [parents]} for every visited revision that has
        parents and caches the commit records in self.commitcache.  The
        heads list itself is left untouched.
        """
        visit = list(heads)
        known = {}
        parents = {}
        while visit:
            n = visit.pop(0)
            if n in known or n in self.map: continue
            known[n] = 1
            self.commitcache[n] = self.source.getcommit(n)
            cp = self.commitcache[n].parents
            for p in cp:
                parents.setdefault(n, []).append(p)
                # NOTE(review): this line was missing from the file;
                # without it only the heads themselves would be visited.
                visit.append(p)

        return parents

    def toposort(self, parents):
        """Return the unconverted revisions in parents-first order.

        With the 'datesort' option, revisions are additionally ordered by
        (depth, date, id).

        NOTE(review): a revision that has no parents and no children in
        the parents map never enters the children map and is therefore
        not returned -- confirm whether that is intended.
        """
        visit = list(parents)
        seen = {}
        children = {}

        # Invert the parents map.
        while visit:
            n = visit.pop(0)
            if n in seen: continue
            seen[n] = 1
            if n in parents:
                for p in parents[n]:
                    children.setdefault(p, []).append(n)

        s = []
        removed = {}
        visit = list(children)
        while visit:
            n = visit.pop(0)
            if n in removed: continue
            dep = 0
            if n in parents:
                for p in parents[n]:
                    if p in self.map: continue
                    if p not in removed:
                        # we're still dependent
                        dep = 1

            if not dep:
                # all n's parents are in the list
                removed[n] = 1
                if n not in self.map:
                    s.append(n)
                if n in children:
                    for c in children[n]:
                        visit.insert(0, c)

        # Use the options given to this instance rather than the
        # module-level opts dictionary.
        if self.opts.get('datesort'):
            depth = {}
            for n in s:
                depth[n] = 0
                pl = [p for p in self.commitcache[n].parents if p not in self.map]
                if pl:
                    depth[n] = max([depth[p] for p in pl]) + 1

            s = [(depth[n], self.commitcache[n].date, n) for n in s]
            s.sort()
            s = [e[2] for e in s]

        return s

    def copy(self, rev):
        """Copy revision rev to the destination and record it in the map.

        A file the source cannot provide (getfile raises IOError) is
        deleted from the destination.
        """
        c = self.commitcache[rev]
        files = self.source.getchanges(rev)

        for f, v in files:
            try:
                data = self.source.getfile(f, v)
            except IOError:
                self.dest.delfile(f)
            else:
                e = self.source.getmode(f, v)
                self.dest.putfile(f, e, data)

        r = [self.map[v] for v in c.parents]
        names = [f for f, v in files]
        self.map[rev] = self.dest.putcommit(names, r, c)
        self._appendmap(rev, self.map[rev])

    def convert(self):
        """Convert every new changeset, then bring the tags up to date."""
        status("scanning source...\n")
        heads = self.source.getheads()
        parents = self.walktree(heads)
        t = self.toposort(parents)
        num = len(t)
        c = None

        for c in t:
            num -= 1
            desc = self.commitcache[c].desc
            if "\n" in desc:
                desc = desc.splitlines()[0]
            status("%d %s\n" % (num, desc))
            # NOTE(review): this call was missing from the file; without
            # it the loop only printed the changesets.
            self.copy(c)

        tags = self.source.gettags()
        ctags = {}
        for k in tags:
            v = tags[k]
            if v in self.map:
                ctags[k] = self.map[v]

        if c and ctags:
            nrev = self.dest.puttags(ctags)
            # write another hash correspondence to override the previous
            # one so we don't end up with extra tag heads
            if nrev:
                self._appendmap(c, nrev)

def command(src, dest=None, mapfile=None, **opts):
    """Convert the repository at src into dest.

    dest defaults to "<src>-hg" and is created with "hg init" when it
    does not exist yet.  mapfile defaults to the location suggested by
    the destination converter, or to "<dest>/map" when the converter has
    no mapfile() method.
    """
    srcc = converter(src, 0)
    if not hasattr(srcc, "getcommit"):
        abort("%s: can't read from this repo type\n" % src)

    if not dest:
        dest = src + "-hg"
        status("assuming destination %s\n" % dest)
        if not os.path.isdir(dest):
            status("creating repository %s\n" % dest)
            os.system("hg init " + dest)
    destc = converter(dest, 1)
    if not hasattr(destc, "putcommit"):
        # report the destination: that is the repository that cannot be
        # written to
        abort("%s: can't write to this repo type\n" % dest)

    if not mapfile:
        try:
            mapfile = destc.mapfile()
        except AttributeError:
            # join the destination path, not the converter object
            mapfile = os.path.join(dest, "map")

    c = convert(srcc, destc, mapfile, opts)
    # NOTE(review): this call was missing from the file; without it the
    # driver was built but never run.
    c.convert()

# Command-line entry point: parse the options, then run the conversion.
options = [('q', 'quiet', None, 'suppress output'),
           ('', 'datesort', None, 'try to sort changesets by date')]
opts = {}
args = fancyopts.fancyopts(sys.argv[1:], options, opts)

if opts['quiet']:
    quiet = 1

# NOTE(review): the "try:" line and both handler bodies were missing from
# the file; reporting the Abort message on stderr and a short notice on
# interruption are restored here -- confirm against the original script.
try:
    command(*args, **opts)
except Abort:
    warn(sys.exc_info()[1])
except KeyboardInterrupt:
    status("interrupted\n")

More information about the Mercurial mailing list