D6734: git: RFC of a new extension to _directly_ operate on git repositories

durin42 (Augie Fackler) phabricator at mercurial-scm.org
Thu Sep 12 17:52:26 EDT 2019


durin42 edited the summary of this revision.
durin42 updated this revision to Diff 16525.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D6734?vs=16387&id=16525

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D6734/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D6734

AFFECTED FILES
  hgext/git/__init__.py
  hgext/git/dirstate.py
  hgext/git/gitlog.py
  hgext/git/index.py
  mercurial/transaction.py
  setup.py
  tests/test-check-interfaces.py
  tests/test-git-interop.t

CHANGE DETAILS

diff --git a/tests/test-git-interop.t b/tests/test-git-interop.t
new file mode 100644
--- /dev/null
+++ b/tests/test-git-interop.t
@@ -0,0 +1,181 @@
+This test requires pygit2:
+  > python -c 'import pygit2' || exit 80
+
+Setup:
+  > GIT_AUTHOR_NAME='test'; export GIT_AUTHOR_NAME
+  > GIT_AUTHOR_EMAIL='test@example.org'; export GIT_AUTHOR_EMAIL
+  > GIT_AUTHOR_DATE="2007-01-01 00:00:00 +0000"; export GIT_AUTHOR_DATE
+  > GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME"; export GIT_COMMITTER_NAME
+  > GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL"; export GIT_COMMITTER_EMAIL
+  > GIT_COMMITTER_DATE="$GIT_AUTHOR_DATE"; export GIT_COMMITTER_DATE
+
+  > count=10
+  > gitcommit() {
+  >    GIT_AUTHOR_DATE="2007-01-01 00:00:$count +0000";
+  >    GIT_COMMITTER_DATE="$GIT_AUTHOR_DATE"
+  >    git commit "$@" >/dev/null 2>/dev/null || echo "git commit error"
+  >    count=`expr $count + 1`
+  >  }
+
+  > echo "[extensions]" >> $HGRCPATH
+  > echo "git=" >> $HGRCPATH
+
+Make a new repo with git:
+  $ mkdir foo
+  $ cd foo
+  $ git init
+  Initialized empty Git repository in $TESTTMP/foo/.git/
+Ignore the .hg directory within git:
+  $ echo .hg >> .git/info/exclude
+  $ echo alpha > alpha
+  $ git add alpha
+  $ gitcommit -am 'Add alpha'
+  $ echo beta > beta
+  $ git add beta
+  $ gitcommit -am 'Add beta'
+  $ echo gamma > gamma
+  $ git status
+  On branch master
+  Untracked files:
+    (use "git add <file>..." to include in what will be committed)
+  
+  	gamma
+  
+  nothing added to commit but untracked files present (use "git add" to track)
+
+Without creating the .hg, hg status fails:
+  $ hg status
+  abort: no repository found in '$TESTTMP/foo' (.hg not found)!
+  [255]
+But if you run hg init --git, it works:
+  $ hg init --git
+  $ hg id
+  3d9be8deba43
+  $ hg status
+  ? gamma
+Log works too:
+  $ hg log
+  changeset:   1:3d9be8deba43
+  bookmark:    master
+  user:        test <test@example.org>
+  date:        Mon Jan 01 00:00:11 2007 +0000
+  summary:     Add beta
+  
+  changeset:   0:c5864c9d16fb
+  user:        test <test@example.org>
+  date:        Mon Jan 01 00:00:10 2007 +0000
+  summary:     Add alpha
+  
+
+
+and bookmarks:
+  $ hg bookmarks
+   * master                    1:3d9be8deba43
+
+diff even works transparently in both systems:
+  $ echo blah >> alpha
+  $ git diff
+  diff --git a/alpha b/alpha
+  index 4a58007..faed1b7 100644
+  --- a/alpha
+  +++ b/alpha
+  @@ -1 +1,2 @@
+   alpha
+  +blah
+  $ hg diff --git
+  diff --git a/alpha b/alpha
+  --- a/alpha
+  +++ b/alpha
+  @@ -1,1 +1,2 @@
+   alpha
+  +blah
+
+Remove a file, it shows as such:
+  $ rm alpha
+  $ hg status
+  ! alpha
+  ? gamma
+
+Revert works:
+  $ hg revert alpha --traceback
+  $ hg status
+  ? gamma
+  $ git status
+  On branch master
+  Untracked files:
+    (use "git add <file>..." to include in what will be committed)
+  
+  	gamma
+  
+  nothing added to commit but untracked files present (use "git add" to track)
+
+Add shows sanely in both:
+  $ hg add gamma
+  $ hg status
+  A gamma
+  $ git status
+  On branch master
+  Changes to be committed:
+    (use "git reset HEAD <file>..." to unstage)
+  
+  	new file:   gamma
+  
+
+forget does what it should as well:
+  $ hg forget gamma
+  $ hg status
+  ? gamma
+  $ git status
+  On branch master
+  Untracked files:
+    (use "git add <file>..." to include in what will be committed)
+  
+  	gamma
+  
+  nothing added to commit but untracked files present (use "git add" to track)
+
+hg log FILE
+
+  $ echo a >> alpha
+  $ hg ci -m 'more alpha'
+  $ echo b >> beta
+  $ hg ci -m 'more beta'
+  $ echo a >> alpha
+  $ hg ci -m 'even more alpha'
+  $ hg log -G alpha
+  @  changeset:   4:3c23e80807e9
+  :  user:        test <test>
+  :  date:        Thu Sep 12 21:49:03 2019 +0000
+  :  summary:     even more alpha
+  :
+  o  changeset:   2:3c9fdd548b09
+  :  user:        test <test>
+  :  date:        Thu Sep 12 21:49:02 2019 +0000
+  :  summary:     more alpha
+  :
+  o  changeset:   0:c5864c9d16fb
+     user:        test <test@example.org>
+     date:        Mon Jan 01 00:00:10 2007 +0000
+     summary:     Add alpha
+  
+  $ hg log -G beta
+  o  changeset:   3:6f0d7ea2092c
+  :  user:        test <test>
+  :  date:        Thu Sep 12 21:49:03 2019 +0000
+  :  summary:     more beta
+  :
+  o  changeset:   1:3d9be8deba43
+  |  user:        test <test@example.org>
+  ~  date:        Mon Jan 01 00:00:11 2007 +0000
+     summary:     Add beta
+  
+
+hg annotate
+
+  $ hg annotate alpha
+  0: alpha
+  2: a
+  4: a
+  $ hg annotate beta
+  1: beta
+  3: b
diff --git a/tests/test-check-interfaces.py b/tests/test-check-interfaces.py
--- a/tests/test-check-interfaces.py
+++ b/tests/test-check-interfaces.py
@@ -44,6 +44,11 @@
     wireprotov2server,
 )
 
+from hgext.git import (
+    dirstate as gitdirstate,
+    gitlog,
+)
+
 testdir = os.path.dirname(__file__)
 rootdir = pycompat.fsencode(os.path.normpath(os.path.join(testdir, '..')))
 
@@ -193,6 +198,10 @@
 
     ziverify.verifyClass(intdirstate.idirstate, dirstate.dirstate)
 
+    # git interop implementations
+    ziverify.verifyClass(intdirstate.idirstate, gitdirstate.gitdirstate)
+    ziverify.verifyClass(repository.ifilestorage, gitlog.filelog)
+
     vfs = vfsmod.vfs(b'.')
     fl = filelog.filelog(vfs, b'dummy.i')
     checkzobject(fl, allowextra=True)
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -1079,6 +1079,7 @@
             'hgext', 'hgext.convert', 'hgext.fsmonitor',
             'hgext.fastannotate',
             'hgext.fsmonitor.pywatchman',
+            'hgext.git',
             'hgext.highlight',
             'hgext.infinitepush',
             'hgext.largefiles', 'hgext.lfs', 'hgext.narrow',
diff --git a/mercurial/transaction.py b/mercurial/transaction.py
--- a/mercurial/transaction.py
+++ b/mercurial/transaction.py
@@ -467,8 +467,11 @@
             self._generatefiles(group=gengroupprefinalize)
             categories = sorted(self._finalizecallback)
             for cat in categories:
-                self._finalizecallback[cat](self)
+                try:
+                    self._finalizecallback[cat](self)
+                except TypeError as e:
+                    raise TypeError('%r: %r (%r)' % (cat, self._finalizecallback[cat], e))
             # Prevent double usage and help clear cycles.
             self._finalizecallback = None
             self._generatefiles(group=gengrouppostfinalize)
 
diff --git a/hgext/git/index.py b/hgext/git/index.py
new file mode 100644
--- /dev/null
+++ b/hgext/git/index.py
@@ -0,0 +1,172 @@
+from __future__ import absolute_import
+
+import os
+import sqlite3
+
+from mercurial.i18n import _
+
+from mercurial import (
+    encoding,
+    error,
+    node as nodemod,
+)
+
+import pygit2
+
+_CURRENT_SCHEMA_VERSION = 1
+_SCHEMA = """
+CREATE TABLE refs (
+  -- node and name are unique together. There may be more than one name for
+  -- a given node, and there may be no name at all for a given node (in the
+  -- case of an anonymous hg head).
+  node TEXT NOT NULL,
+  name TEXT
+);
+
+-- The topological heads of the changelog, which hg depends on.
+CREATE TABLE heads (
+  node TEXT NOT NULL
+);
+
+-- A total ordering of the changelog
+CREATE TABLE changelog (
+  rev INTEGER NOT NULL PRIMARY KEY,
+  node TEXT NOT NULL,
+  p1 TEXT,
+  p2 TEXT
+);
+
+CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
+CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);
+
+-- Changed files for each commit, which lets us dynamically build
+-- filelogs.
+CREATE TABLE changedfiles (
+  node TEXT NOT NULL,
+  filename TEXT NOT NULL,
+  -- 40 zeroes for deletions
+  filenode TEXT NOT NULL
+);
+
+CREATE INDEX changedfiles_nodes_idx
+  ON changedfiles(node);
+
+PRAGMA user_version=%d
+""" % _CURRENT_SCHEMA_VERSION
+
+def _createdb(path):
+    # print('open db', path)
+    # import traceback
+    # traceback.print_stack()
+    db = sqlite3.connect(encoding.strfromlocal(path))
+    db.text_factory = bytes
+
+    res = db.execute(r'PRAGMA user_version').fetchone()[0]
+
+    # New database.
+    if res == 0:
+        for statement in _SCHEMA.split(';'):
+            db.execute(statement.strip())
+
+        db.commit()
+
+    elif res == _CURRENT_SCHEMA_VERSION:
+        pass
+
+    else:
+        raise error.Abort(_('sqlite database has unrecognized version'))
+
+    db.execute(r'PRAGMA journal_mode=WAL')
+
+    return db
+
+_OUR_ORDER = (pygit2.GIT_SORT_TOPOLOGICAL |
+              pygit2.GIT_SORT_TIME |
+              pygit2.GIT_SORT_REVERSE)
+
+_DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2
+
+def _index_repo(gitrepo, db, progress_cb):
+    # Identify all references so we can tell the walker to visit all of them.
+    all_refs = gitrepo.listall_references()
+    walker = None
+    possible_heads = set()
+    for pos, ref in enumerate(all_refs):
+        progress_cb('refs', pos)
+        try:
+            start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
+        except ValueError:
+            # No commit to be found, so we don't care for hg's purposes.
+            continue
+        possible_heads.add(start.id.hex)
+        if walker is None:
+            walker = gitrepo.walk(start.id, _OUR_ORDER)
+        else:
+            walker.push(start.id)
+    # Empty out the existing changelog. Even for large-ish histories
+    # we can do the top-level "walk all the commits" dance very
+    # quickly as long as we don't need to figure out the changed files
+    # list.
+    db.execute('DELETE FROM changelog')
+    progress_cb('refs', None)
+    # This walker is sure to visit all the revisions in history, but
+    # only once.
+    for pos, commit in enumerate(walker):
+        progress_cb('commits', pos)
+        r = commit.id.raw
+        p1 = p2 = nodemod.nullhex
+        if len(commit.parents) > 2:
+            raise error.ProgrammingError(
+                ("git support can't handle octopus merges, "
+                 "found a commit with %d parents :(") % len(commit.parents))
+        if commit.parents:
+            p1 = commit.parents[0].id.hex
+        if len(commit.parents) == 2:
+            p2 = commit.parents[1].id.hex
+        db.execute(
+            'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
+            (pos, commit.id.hex, p1, p2))
+
+        num_changedfiles = db.execute(
+            "SELECT COUNT(*) from changedfiles WHERE node = ?",
+            (commit.id.hex,)).fetchone()[0]
+        if not num_changedfiles:
+            files = {}
+            # I *think* we only need to check p1 for changed files
+            # (and therefore linkrevs), because any node that would
+            # actually have this commit as a linkrev would be
+            # completely new in this rev.
+            p1 = commit.parents[0].id.hex if commit.parents else None
+            if p1 is not None:
+                patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
+            else:
+                patchgen = commit.tree.diff_to_tree(
+                    swap=True, flags=_DIFF_FLAGS)
+            new_files = (p.delta.new_file for p in patchgen)
+            files = {nf.path: nf.id.hex for nf in new_files
+                     if nf.id.raw != nodemod.nullid}
+            for p, n in files.items():
+                db.execute(
+                    'INSERT INTO changedfiles (node, filename, filenode) '
+                    'VALUES(?, ?, ?)',
+                    (commit.id.hex, p, n))
+    db.execute('DELETE FROM heads')
+    for h in possible_heads:
+        haschild = db.execute(
+            'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?',
+            (h, h)).fetchone()[0]
+        if not haschild:
+            db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
+
+    progress_cb('commits', None)
+
+def get_index(gitrepo):
+    cachepath = os.path.join(gitrepo.path, '..', '.hg', 'cache')
+    if not os.path.exists(cachepath):
+        os.makedirs(cachepath)
+    dbpath = os.path.join(cachepath, 'git-commits.sqlite')
+    db = _createdb(dbpath)
+    # TODO check against gitrepo heads before doing a full index
+    # TODO thread a ui.progress call into this layer
+    _index_repo(gitrepo, db, lambda x, y: None)
+    return db
diff --git a/hgext/git/gitlog.py b/hgext/git/gitlog.py
new file mode 100644
--- /dev/null
+++ b/hgext/git/gitlog.py
@@ -0,0 +1,376 @@
+from __future__ import absolute_import
+
+from mercurial.i18n import _
+
+from mercurial import (
+    ancestor,
+    changelog as hgchangelog,
+    dagop,
+    error,
+    manifest,
+    match as matchmod,
+    node as nodemod,
+    pycompat,
+    revlog,
+)
+from mercurial.interfaces import (
+    repository,
+    util as interfaceutil,
+)
+from mercurial.utils import (
+    stringutil,
+)
+from . import (
+    index,
+)
+
+import pygit2
+
+class baselog(object): # revlog.revlog):
+    """Common implementations between changelog and manifestlog."""
+    def __init__(self, gr, db):
+        self.gitrepo = gr
+        self._db = db
+
+    def __len__(self):
+        return int(self._db.execute(
+            'SELECT COUNT(*) FROM changelog').fetchone()[0])
+
+    def rev(self, n):
+        if n == nodemod.nullid:
+            return -1
+        t = self._db.execute(
+            'SELECT rev FROM changelog WHERE node = ?',
+            (nodemod.hex(n),)).fetchone()
+        if t is None:
+            raise error.LookupError(n, '00changelog.i', _('no node'))
+        return t[0]
+
+    def node(self, r):
+        if r == nodemod.nullrev:
+            return nodemod.nullid
+        t = self._db.execute(
+            'SELECT node FROM changelog WHERE rev = ?',
+            (r,)).fetchone()
+        if t is None:
+            raise error.LookupError(r, '00changelog.i', _('no node'))
+        return nodemod.bin(t[0])
+
+    def hasnode(self, n):
+        # The index stores hex nodes, so encode n the same way rev() does.
+        t = self._db.execute('SELECT node FROM changelog WHERE node = ?',
+                             (nodemod.hex(n),)).fetchone()
+        return t is not None
+
+# TODO: an interface for the changelog type?
+class changelog(baselog):
+
+    def __contains__(self, rev):
+        try:
+            self.node(rev)
+            return True
+        except error.LookupError:
+            return False
+
+    @property
+    def filteredrevs(self):
+        # TODO: we should probably add a refs/hg/ namespace for hidden
+        # heads etc, but that's an idea for later.
+        return set()
+
+    @property
+    def nodemap(self):
+        r = {
+            nodemod.bin(v[0]): v[1] for v in
+            self._db.execute('SELECT node, rev FROM changelog')}
+        r[nodemod.nullid] = nodemod.nullrev
+        return r
+
+    def tip(self):
+        """Return the binary node of the highest rev, or nullid if empty."""
+        t = self._db.execute(
+            'SELECT node FROM changelog ORDER BY rev DESC LIMIT 1').fetchone()
+        # The index stores hex nodes, so decode (not re-encode) them.
+        return nodemod.bin(t[0]) if t else nodemod.nullid
+
+    def revs(self, start=0, stop=None):
+        """Yield revs in [start, stop]; stop defaults to the last rev."""
+        if stop is None:
+            stop = len(self) - 1  # a rev number, not the tip *node*
+        t = self._db.execute(
+            'SELECT rev FROM changelog '
+            'WHERE rev >= ? AND rev <= ? '
+            'ORDER BY REV ASC', (start, stop))
+        return (int(r[0]) for r in t)
+
+    def headrevs(self, revs=None):
+        realheads =  [int(x[0]) for x in
+                      self._db.execute(
+                          'SELECT rev FROM changelog '
+                          'INNER JOIN heads ON changelog.node = heads.node')]
+        if revs:
+            return sorted([r for r in revs if r in realheads])
+        return sorted(realheads)
+
+    def changelogrevision(self, nodeorrev):
+        # Ensure we have a node id
+        if isinstance(nodeorrev, int):
+            n = self.node(nodeorrev)
+        else:
+            n = nodeorrev
+        # handle looking up nullid
+        if n == nodemod.nullid:
+            return hgchangelog._changelogrevision(extra={})
+        hn = nodemod.hex(n)
+        # We've got a real commit!
+        files = [r[0] for r in self._db.execute(
+            'SELECT filename FROM changedfiles '
+            'WHERE node = ? and filenode != ?',
+            (hn, nodemod.nullhex))]
+        filesremoved = [r[0] for r in self._db.execute(
+            'SELECT filename FROM changedfiles '
+            'WHERE node = ? and filenode = ?',
+            (hn, nodemod.nullhex))]
+        c = self.gitrepo[hn]
+        return hgchangelog._changelogrevision(
+            manifest=n, # pretend manifest the same as the commit node
+            user='%s <%s>' % (c.author.name, c.author.email),
+            # TODO: a fuzzy memory from hg-git hacking says this should be -offset
+            date=(c.author.time, c.author.offset),
+            files=files,
+            # TODO filesadded in the index
+            filesremoved=filesremoved,
+            description=c.message,
+            # TODO do we want to handle extra? how?
+            extra={b'branch': b'default'},
+        )
+
+    def ancestors(self, revs, stoprev=0, inclusive=False):
+        revs = list(revs)
+        tiprev = len(self) - 1  # highest rev number; tip() returns a node
+        for r in revs:
+            if r > tiprev:
+                raise IndexError('Invalid rev %r' % r)
+        return ancestor.lazyancestors(
+            self.parentrevs, revs, stoprev=stoprev, inclusive=inclusive)
+
+    # Cleanup opportunity: this is *identical* to the revlog.py version
+    def descendants(self, revs):
+        return dagop.descendantrevs(revs, self.revs, self.parentrevs)
+
+    def reachableroots(self, minroot, heads, roots, includepath=False):
+        return dagop._reachablerootspure(self.parentrevs,
+                                         minroot, roots, heads, includepath)
+
+    def parentrevs(self, rev):
+        """Return (p1rev, p2rev) for rev, using nullrev for absent parents."""
+        c = self.gitrepo[nodemod.hex(self.node(rev))]
+        p1 = p2 = nodemod.nullrev
+        if c.parents:
+            p1 = self.rev(c.parents[0].id.raw)
+            if len(c.parents) > 2:
+                raise error.Abort('TODO octopus merge handling')
+            if len(c.parents) == 2:
+                # The second parent, not a repeat of the first.
+                p2 = self.rev(c.parents[1].id.raw)
+        return p1, p2
+
+    # Private method is used at least by the tags code.
+    _uncheckedparentrevs = parentrevs
+
+    def commonancestorsheads(self, a, b):
+        # TODO the revlog verson of this has a C path, so we probably
+        # need to optimize this...
+        a, b = self.rev(a), self.rev(b)
+        return [self.node(n) for n in
+                ancestor.commonancestorsheads(self.parentrevs, a, b)]
+
+    def branchinfo(self, rev):
+        """Git doesn't do named branches, so just put everything on default."""
+        return b'default', False
+
+    def delayupdate(self, tr):
+        # TODO: I think we can elide this because we're just dropping
+        # an object in the git repo?
+        pass
+
+    def add(self, manifest, files, desc, transaction, p1, p2,
+            user, date=None, extra=None, p1copies=None, p2copies=None,
+            filesadded=None, filesremoved=None):
+        parents = []
+        hp1, hp2 = nodemod.hex(p1), nodemod.hex(p2)
+        if p1 != nodemod.nullid:
+            parents.append(hp1)
+        if p2 and p2 != nodemod.nullid:
+            parents.append(hp2)
+        sig = pygit2.Signature(stringutil.person(user), stringutil.email(user))
+        oid = self.gitrepo.create_commit(
+            None, sig, sig, desc,
+            nodemod.hex(manifest), parents)
+        # Set up an internal reference to force the commit into the
+        # changelog. Hypothetically, we could even use this refs/hg/
+        # namespace to allow for anonymous heads on git repos, which
+        # would be neat.
+        self.gitrepo.references.create(
+            'refs/hg/internal/latest-commit', oid, force=True)
+        # Reindex now to pick up changes
+        index._index_repo(self.gitrepo, self._db, lambda x, y: None)
+        return oid.raw
+
+# TODO: Make a split between mutable and immutable manifest types here.
+class gittreemanifest(object):
+    def __init__(self, gt, builderfn):
+        self._builderfn = builderfn
+        self._tree = gt
+        self._builder = None
+
+    def __contains__(self, k):
+        if self._builder:
+            return self._builder.get(k) is not None
+        return k in self._tree
+
+    def __getitem__(self, k):
+        if self._builder:
+            match = self._builder.get(k)
+            if match is None:
+                raise error.LookupError('File %r not found in tree %r' % (
+                    k, self._tree.id.hex))
+            return match
+        try:
+            return self._tree[k].id.raw
+        except ValueError:
+            raise error.LookupError('File %r not found in tree %r' % (
+                k, self._tree.id.hex))
+
+    def __setitem__(self, k, v):
+        if self._builder is None:
+            self._builder = self._builderfn()
+        self._builder.insert(k, nodemod.hex(v), pygit2.GIT_FILEMODE_BLOB)
+
+    def setflag(self, p, flag):
+        """Re-insert path p with the git filemode for flag ('', 'x', 'l')."""
+        modes = {'': pygit2.GIT_FILEMODE_BLOB,
+                 'x': pygit2.GIT_FILEMODE_BLOB_EXECUTABLE,
+                 'l': pygit2.GIT_FILEMODE_LINK}
+        oid = self._builder.get(p).id
+        if flag and flag not in modes:
+            # Both values go in one tuple so each %r gets its argument.
+            raise ValueError('Illegal flag value %r on path %r' % (flag, p))
+        self._builder.insert(p, oid, modes[flag or ''])
+
+    def flags(self, k):
+        # TODO flags handling
+        return ''
+
+    def walk(self, match):
+        for f in self._tree:
+            # TODO recurse into subtrees...
+            yield f.name
+
+    def get(self, fname, default=None):
+        if fname in self:
+            return self[fname]
+        return default
+
+@interfaceutil.implementer(repository.imanifestrevisionstored)
+class gittreemanifestctx(object):
+    def __init__(self, repo, gittree):
+        self._repo = repo
+        self._tree = gittree
+        self._builder = None
+
+    def _getbuilder(self):
+        if self._builder is None:
+            self._builder = self._repo.TreeBuilder(self._tree)
+        return self._builder
+
+    def read(self):
+        return gittreemanifest(self._tree, self._getbuilder)
+
+    def find(self, path):
+        return self.read()[path]  # hand the lookup result back to the caller
+
+    def copy(self):
+        return gittreemanifestctx(self._repo, self._tree)
+
+    def write(self, transaction, link, p1, p2, added, removed, match=None):
+        # We're not (for now, anyway) going to audit filenames, so we
+        # can ignore added and removed.
+
+        # TODO what does this match argument get used for? hopefully
+        # just narrow?
+        assert not match or isinstance(match, matchmod.alwaysmatcher)
+        return self._getbuilder().write().raw
+
+class manifestlog(baselog):
+
+    def __getitem__(self, node):
+        return self.get('', node)
+
+    def get(self, relpath, node):
+        if node == nodemod.nullid:
+            return manifest.memtreemanifestctx(self, relpath)
+        commit = self.gitrepo[nodemod.hex(node)]
+        t = commit.tree
+        if relpath:
+            parts = relpath.split('/')
+            for p in parts:
+                te = t[p]
+                t = self.gitrepo[te.id]
+        return gittreemanifestctx(self.gitrepo, t)
+
+@interfaceutil.implementer(repository.ifilestorage)
+class filelog(baselog):
+    def __init__(self, gr, db, path):
+        super(filelog, self).__init__(gr, db)
+        self.path = path
+
+    def read(self, node):
+        return self.gitrepo[nodemod.hex(node)].data
+
+    def lookup(self, node):
+        if len(node) not in (20, 40):
+            node = int(node)
+        if isinstance(node, int):
+            assert False, 'todo revnums for nodes'
+        if len(node) == 40:
+            hnode = node
+            node = nodemod.bin(node)
+        else:
+            hnode = nodemod.hex(node)
+        if hnode in self.gitrepo:
+            return node
+        raise error.LookupError(self.path, node, _('no match found'))
+
+    def cmp(self, node, text):
+        """Returns True if text is different than content at `node`."""
+        return self.read(node) != text
+
+    def add(self, text, meta, transaction, link, p1=None, p2=None):
+        assert not meta  # Should we even try to handle this?
+        return self.gitrepo.create_blob(text).raw
+
+    def __iter__(self):
+        for clrev in self._db.execute('''
+SELECT rev FROM changelog
+INNER JOIN changedfiles ON changelog.node = changedfiles.node
+WHERE changedfiles.filename = ? AND changedfiles.filenode != ?
+        ''', (self.path, nodemod.nullhex)):
+            yield clrev[0]
+
+    def linkrev(self, fr):
+        return fr
+
+    def rev(self, node):
+        return int(self._db.execute('''
+SELECT rev FROM changelog
+INNER JOIN changedfiles ON changelog.node = changedfiles.node
+WHERE changedfiles.filename = ? AND changedfiles.filenode = ?''', (
+    self.path, nodemod.hex(node))).fetchone()[0])
+
+    def node(self, rev):
+        return nodemod.bin(self._db.execute(
+'''SELECT filenode FROM changedfiles
+INNER JOIN changelog ON changelog.node = changedfiles.node
+WHERE changelog.rev = ? AND filename = ?
+''', (rev, self.path)).fetchone()[0])
diff --git a/hgext/git/dirstate.py b/hgext/git/dirstate.py
new file mode 100644
--- /dev/null
+++ b/hgext/git/dirstate.py
@@ -0,0 +1,249 @@
+from __future__ import absolute_import
+
+import contextlib
+import errno
+import os
+import stat
+
+from mercurial import (
+    dirstate,
+    error,
+    extensions,
+    match as matchmod,
+    node as nodemod,
+    scmutil,
+    util,
+)
+from mercurial.interfaces import (
+    dirstate as intdirstate,
+    util as interfaceutil,
+)
+from mercurial.i18n import _
+
+import pygit2
+
+
+def readpatternfile(orig, filepath, warn, sourceinfo=False):
+    """Parse git ignore files; defer any other file to the wrapped orig."""
+    if not ('info/exclude' in filepath or filepath.endswith('.gitignore')):
+        return orig(filepath, warn, sourceinfo=sourceinfo)
+    result, warnings = [], []
+    with open(filepath, 'rb') as fp:
+        for l in fp:
+            l = l.strip()
+            if not l or l.startswith('#'):
+                continue
+            if l.startswith('!'):
+                warnings.append('unsupported ignore pattern %s' % l)
+                continue
+            if l.startswith('/'):
+                # on reflection, I think /foo is just glob:
+                result.append('glob:' + l[1:])
+            else:
+                result.append('relglob:' + l)
+    return result, warnings
+extensions.wrapfunction(matchmod, 'readpatternfile', readpatternfile)
+
+
+_STATUS_MAP = {
+    pygit2.GIT_STATUS_CONFLICTED: 'm',
+    pygit2.GIT_STATUS_CURRENT: 'n',
+    pygit2.GIT_STATUS_IGNORED: '?',
+    pygit2.GIT_STATUS_INDEX_DELETED: 'r',
+    pygit2.GIT_STATUS_INDEX_MODIFIED: 'n',
+    pygit2.GIT_STATUS_INDEX_NEW: 'a',
+    pygit2.GIT_STATUS_INDEX_RENAMED: 'a',
+    pygit2.GIT_STATUS_INDEX_TYPECHANGE: 'n',
+    pygit2.GIT_STATUS_WT_DELETED: 'r',
+    pygit2.GIT_STATUS_WT_MODIFIED: 'n',
+    pygit2.GIT_STATUS_WT_NEW: '?',
+    pygit2.GIT_STATUS_WT_RENAMED: 'a',
+    pygit2.GIT_STATUS_WT_TYPECHANGE: 'n',
+    pygit2.GIT_STATUS_WT_UNREADABLE: '?',
+}
+
+
+@interfaceutil.implementer(intdirstate.idirstate)
+class gitdirstate(object):
+
+    def __init__(self, ui, root, gitrepo):
+        self._ui = ui
+        self._root = os.path.dirname(root)
+        self.git = gitrepo
+
+    def p1(self):
+        return self.git.head.peel().id.raw
+
+    def p2(self):
+        # TODO: MERGE_HEAD? something like that, right?
+        return nodemod.nullid
+
+    def setparents(self, p1, p2=nodemod.nullid):
+        assert p2 == nodemod.nullid, 'TODO merging support'
+        self.git.head.set_target(nodemod.hex(p1))
+
+    @util.propertycache
+    def identity(self):
+        return util.filestat.frompath(
+            os.path.join(self._root, '.git', 'index'))
+
+    def branch(self):
+        return b'default'
+
+    def parents(self):
+        # TODO how on earth do we find p2 if a merge is in flight?
+        return self.p1(), nodemod.nullid
+
+    def __iter__(self):
+        # TODO is this going to give us unicodes on py3?
+        return (f.path for f in self.git.index)
+
+    def items(self):
+        for ie in self.git.index:
+            yield ie.path, None # value should be a dirstatetuple
+
+    # py2,3 compat forward
+    iteritems = items
+
+    def __getitem__(self, filename):
+        try:
+            gs = self.git.status_file(filename)
+        except KeyError:
+            return '?'
+        return _STATUS_MAP[gs]
+
+    def __contains__(self, filename):
+        try:
+            gs = self.git.status_file(filename)
+            return _STATUS_MAP[gs] != '?'
+        except KeyError:
+            return False
+
+    def status(self, match, subrepos, ignored, clean, unknown):
+        # TODO handling of clean files - can we get that from git.status()?
+        modified, added, removed, deleted, unknown, ignored, clean = (
+            [], [], [], [], [], [], [])
+        gstatus = self.git.status()
+        for path, status in gstatus.items():
+            if status == pygit2.GIT_STATUS_IGNORED:
+                if path.endswith('/'):
+                    continue
+                ignored.append(path)
+            elif status in (pygit2.GIT_STATUS_WT_MODIFIED,
+                            pygit2.GIT_STATUS_INDEX_MODIFIED,
+                            pygit2.GIT_STATUS_WT_MODIFIED|pygit2.GIT_STATUS_INDEX_MODIFIED):
+                modified.append(path)
+            elif status == pygit2.GIT_STATUS_INDEX_NEW:
+                added.append(path)
+            elif status == pygit2.GIT_STATUS_WT_NEW:
+                unknown.append(path)
+            elif status == pygit2.GIT_STATUS_WT_DELETED:
+                deleted.append(path)
+            elif status == pygit2.GIT_STATUS_INDEX_DELETED:
+                removed.append(path)
+            else:
+                raise error.Abort('unhandled case: status for %r is %r' % (
+                    path, status))
+
+        # TODO are we really always sure of status here?
+        return False, scmutil.status(
+            modified, added, removed, deleted, unknown, ignored, clean)
+
+    def flagfunc(self, buildfallback):
+        # TODO we can do better
+        return buildfallback()
+
+    def getcwd(self):
+        # TODO is this a good way to do this?
+        return os.path.dirname(os.path.dirname(self.git.path))
+
+    def normalize(self, path):
+        assert util.normcase(path) == path, 'TODO handling of case folding'
+        return path
+
+    @property
+    def _checklink(self):
+        return util.checklink(os.path.dirname(self.git.path))
+
+    def copies(self):
+        # TODO support copies?
+        return {}
+
+    # # TODO what the heck is this
+    _filecache = set()
+
+    def pendingparentchange(self):
+        # TODO: we need to implement the context manager bits and
+        # correctly stage/revert index edits.
+        return False
+
+    def write(self, tr):
+
+        if tr:
+
+            def writeinner(category):
+                self.git.index.write()
+
+            tr.addpending('gitdirstate', writeinner)
+        else:
+            self.git.index.write()
+
+    def pathto(self, f, cwd=None):
+        """Delegate to util.pathto for f, defaulting cwd to self.getcwd()."""
+        if cwd is None:
+            cwd = self.getcwd()
+        # TODO core dirstate does something about slashes here
+        return util.pathto(self._root, cwd, f)
+
+    def normal(self, f, parentfiledata=None):
+        """Mark a file normal and clean."""
+        # TODO: for now we just let libgit2 re-stat the file. We can
+        # clearly do better.
+
+    def normallookup(self, f):
+        """Mark a file normal, but possibly dirty."""
+        # TODO: for now we just let libgit2 re-stat the file. We can
+        # clearly do better.
+
+    def walk(self, match, subrepos, unknown, ignored, full=True):
+        # TODO: we need to use .status() and not iterate the index,
+        # because the index doesn't force a re-walk and so `hg add` of
+        # a new file without an intervening call to status will
+        # silently do nothing.
+        r = {}
+        cwd = self.getcwd()
+        for path, status in self.git.status().items():
+            if path.startswith('.hg/'):
+                continue
+            # TODO construct the stat info from the status object?
+            try:
+                s = os.stat(os.path.join(cwd, path))
+            except OSError as e:
+                if e.errno != errno.ENOENT:
+                    raise
+                continue
+            r[path] = s
+        return r
+
+    def savebackup(self, tr, backupname):
+        # TODO: figure out a strategy for saving index backups.
+        pass
+
+    def restorebackup(self, tr, backupname):
+        # TODO: figure out a strategy for saving index backups.
+        pass
+
+    def add(self, f):
+        self.git.index.add(f)
+
+    def drop(self, f):
+        self.git.index.remove(f)
+
+    def copied(self, path):
+        # TODO: track copies?
+        return None
+
+    @contextlib.contextmanager
+    def parentchange(self):
+        # TODO: track this maybe?
+        yield
diff --git a/hgext/git/__init__.py b/hgext/git/__init__.py
new file mode 100644
--- /dev/null
+++ b/hgext/git/__init__.py
@@ -0,0 +1,144 @@
+"""Grant Mercurial the ability to operate on Git repositories. (EXPERIMENTAL)
+
+This is currently super experimental. It probably will consume your
+firstborn a la Rumpelstiltskin, etc.
+"""
+
+from __future__ import absolute_import
+
+import os
+
+from mercurial import (
+    commands,
+    debugcommands,
+    extensions,
+    hg,
+    localrepo,
+    store,
+)
+from mercurial.interfaces import (
+    repository,
+    util as interfaceutil,
+)
+
+from . import (
+    dirstate,
+    gitlog,
+    index,
+)
+
+import pygit2
+
+# TODO: extract an interface for this in core
+class gitstore(object): # store.basicstore):
+    """Store object backed by the git repository at <path>/../.git."""
+
+    def __init__(self, path, vfstype):
+        self.vfs = vfs = vfstype(path)
+        self.path = vfs.base
+        self.createmode = store._calcmode(vfs)
+        # above lines should go away in favor of:
+        # super(gitstore, self).__init__(path, vfstype)
+        gitdir = os.path.normpath(os.path.join(path, '..', '.git'))
+        self.git = pygit2.Repository(gitdir)
+        self._db = index.get_index(self.git)
+
+    def join(self, f):
+        """Map a store file name onto a real path for git repositories.
+
+        For the most part, store.join serves @storecache decorators,
+        which use it to invalidate caches when files change. The names
+        we care about are mapped; any other raises NotImplementedError.
+        """
+        if f == 'lock':
+            # TODO: we probably want to map this to a git lock, I
+            # suspect index.lock. We should figure out what the
+            # most-alike file is in git-land. For now we're risking
+            # bad concurrency errors if another git client is used.
+            return os.path.join(self.path, 'hgit-bogus-lock')
+        if f in ('00changelog.i', '00manifest.i'):
+            # This is close enough: in order for the changelog cache
+            # to be invalidated, HEAD will have to change.
+            return os.path.join(self.path, 'HEAD')
+        if f in ('obsstore', 'phaseroots', 'narrowspec', 'bookmarks'):
+            return os.path.join(self.path, '..', '.hg', f)
+        raise NotImplementedError('Need to pick file for %s.' % f)
+
+    def changelog(self, trypending):
+        # trypending is unused: TODO we don't have a plan for it yet
+        return gitlog.changelog(self.git, self._db)
+
+    def manifestlog(self, repo, storenarrowmatch):
+        # TODO handle storenarrowmatch; figure out if we need the repo arg
+        return gitlog.manifestlog(self.git, self._db)
+
+    def invalidatecaches(self):
+        """No-op in this implementation."""
+
+    def write(self, tr=None):
+        """No-op; normally this handles fncache writes, which we lack."""
+
+def _makestore(orig, requirements, storebasepath, vfstype):
+    needed = ('this-is-git', os.path.join('..', '.git'))  # gitstore needs both
+    if all(os.path.exists(os.path.join(storebasepath, n)) for n in needed):
+        return gitstore(storebasepath, vfstype)
+    return orig(requirements, storebasepath, vfstype)
+
+class gitfilestorage(object):
+    """Provides file(), which builds a gitlog.filelog from self.store."""
+    def file(self, path):
+        path = path[1:] if path[0:1] == b'/' else path  # drop a leading '/'
+        return gitlog.filelog(self.store.git, self.store._db, path)
+
+def _makefilestorage(orig, requirements, features, **kwargs):
+    """Use gitfilestorage for a gitstore; otherwise defer to orig."""
+    if not isinstance(kwargs['store'], gitstore):
+        return orig(requirements, features, **kwargs)
+    return gitfilestorage
+
+def _setupdothg(ui, path):
+    """Create path/.hg (warn if it exists) and write its git marker files."""
+    dothg = os.path.join(path, b'.hg')
+    if os.path.exists(dothg):
+        ui.warn(b'git repo already initialized for hg\n')
+    else:
+        os.mkdir(dothg)
+        # TODO is it ok to extend .git/info/exclude like this?
+        exclude = os.path.join(path, b'.git', b'info', b'exclude')
+        with open(exclude, 'ab') as fp:
+            fp.write(b'\n.hg\n')
+    for name, data in ((b'this-is-git', b''), (b'requirements', b'git\n')):
+        with open(os.path.join(dothg, name), 'wb') as fp:  # data is bytes
+            fp.write(data)
+
+def init(orig, ui, dest='.', **opts):
+    """Wrap `hg init`: with --git, set up .hg next to dest's .git."""
+    if not opts.get('git', False):
+        return orig(ui, dest=dest, **opts)
+    path = os.path.abspath(dest)
+    # TODO: walk up looking for the git repo
+    pygit2.Repository(os.path.join(path, '.git'))  # result is unused
+    _setupdothg(ui, path)
+    return 0 # debugcommands.debugrebuilddirstate(
+        # ui, hg.repository(ui, path), rev='.')
+
+def reposetup(ui, repo):
+    """Swap in a repo class whose dirstate is a gitdirstate for gitstores."""
+    # getattr: objects lacking a store attribute are returned untouched.
+    if not isinstance(getattr(repo, 'store', None), gitstore):
+        return repo
+    base = repo.__class__
+    class gitlocalrepo(base):
+        def _makedirstate(self):
+            # TODO narrow support here
+            return dirstate.gitdirstate(self.ui, self.vfs.base, repo.store.git)
+
+    repo.__class__ = gitlocalrepo
+    return repo
+
+def extsetup(ui):
+    """Wrap localrepo's store factories and add --git to `hg init`."""
+    extensions.wrapfunction(localrepo, 'makestore', _makestore)
+    extensions.wrapfunction(localrepo, 'makefilestorage', _makefilestorage)
+    entry = extensions.wrapcommand(commands.table, 'init', init)
+    entry[1].append(('', 'git', None, 'set up a git repository instead of hg'))



To: durin42, #hg-reviewers
Cc: mjpieters, mercurial-devel


More information about the Mercurial-devel mailing list