[PATCH] convert: use pygit2 if available, to improve performance

Bryan O'Sullivan bos at serpentine.com
Thu Apr 5 16:53:11 CDT 2012


# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1333662769 25200
# Branch stable
# Node ID d6a14f098965261ca2e1722d711bf9f7789ae3c2
# Parent  4d875bb546dc03db33630f5388d7e04939c386a0
convert: use pygit2 if available, to improve performance

When converting a git repository, the main bottleneck is shelling out to
git to retrieve data.  This patch uses the pygit2 bindings to libgit2,
if available, to avoid most callouts to git.  Since the pygit2 bindings
are currently incomplete, we still need to use git for some tasks.

This patch requires version 0.16.1 or newer of pygit2.  If pygit2
is not available or too old, we transparently fall back to invoking
git directly.

diff -r 4d875bb546dc -r d6a14f098965 hgext/convert/git.py
--- a/hgext/convert/git.py	Tue Apr 03 22:01:28 2012 +0200
+++ b/hgext/convert/git.py	Thu Apr 05 14:52:49 2012 -0700
@@ -12,7 +12,22 @@
 
 from common import NoRepo, commit, converter_source, checktool
 
-class convert_git(converter_source):
+class convert_git_base(converter_source):
+    def __init__(self, ui, path, rev):
+        super(convert_git_base, self).__init__(ui, path, rev=rev)
+
+        if os.path.isdir(path + "/.git"):
+            path += "/.git"
+        if not os.path.exists(path + "/objects"):
+            raise NoRepo(_("%s does not look like a Git repository") % path)
+
+        checktool('git', 'git')
+        self.path = path
+
+    # pygit2 0.16.1 has no support for diffing, so we have to shell
+    # out to git for diffs. the methods below would otherwise belong
+    # in convert_git_plain.
+
     # Windows does not support GIT_DIR= construct while other systems
     # cannot remove environment variable. Just assume none have
     # both issues.
@@ -39,47 +54,6 @@
             else:
                 return util.popen('GIT_DIR=%s %s' % (self.path, s), 'rb')
 
-    def gitread(self, s):
-        fh = self.gitopen(s)
-        data = fh.read()
-        return data, fh.close()
-
-    def __init__(self, ui, path, rev=None):
-        super(convert_git, self).__init__(ui, path, rev=rev)
-
-        if os.path.isdir(path + "/.git"):
-            path += "/.git"
-        if not os.path.exists(path + "/objects"):
-            raise NoRepo(_("%s does not look like a Git repository") % path)
-
-        checktool('git', 'git')
-
-        self.path = path
-
-    def getheads(self):
-        if not self.rev:
-            heads, ret = self.gitread('git rev-parse --branches --remotes')
-            heads = heads.splitlines()
-        else:
-            heads, ret = self.gitread("git rev-parse --verify %s" % self.rev)
-            heads = [heads[:-1]]
-        if ret:
-            raise util.Abort(_('cannot retrieve git heads'))
-        return heads
-
-    def catfile(self, rev, type):
-        if rev == hex(nullid):
-            raise IOError()
-        data, ret = self.gitread("git cat-file %s %s" % (type, rev))
-        if ret:
-            raise util.Abort(_('cannot read %r object at %s') % (type, rev))
-        return data
-
-    def getfile(self, name, rev):
-        data = self.catfile(rev, "blob")
-        mode = self.modecache[(name, rev)]
-        return data, mode
-
     def getchanges(self, version):
         self.modecache = {}
         fh = self.gitopen("git diff-tree -z --root -m -r %s" % version)
@@ -108,6 +82,113 @@
             raise util.Abort(_('cannot read changes in %s') % version)
         return (changes, {})
 
+    def getchangedfiles(self, version, i):
+        changes = []
+        if i is None:
+            fh = self.gitopen("git diff-tree --root -m -r %s" % version)
+            for l in fh:
+                if "\t" not in l:
+                    continue
+                m, f = l[:-1].split("\t")
+                changes.append(f)
+        else:
+            fh = self.gitopen('git diff-tree --name-only --root -r %s "%s^%s" --'
+                             % (version, version, i + 1))
+            changes = [f.rstrip('\n') for f in fh]
+        if fh.close():
+            raise util.Abort(_('cannot read changes in %s') % version)
+
+        return changes
+
+def hexoid(obj):
+    # pygit2's "hex" property is unicode, but "oid" is str
+    return obj.oid.encode('hex')
+
+class nopygit2(Exception):
+    pass
+
+class convert_git_pygit2(convert_git_base):
+    def __init__(self, ui, path, rev):
+        super(convert_git_pygit2, self).__init__(ui, path, rev=rev)
+        try:
+            # probe Commit._message to check for pygit2 0.16.1 or newer
+            import pygit2
+            pygit2.Commit._message
+        except (AttributeError, ImportError):
+            raise nopygit2
+        self.repo = pygit2.Repository(self.path)  # open only; never init
+
+    def getheads(self):
+        if not self.rev:
+            return [hexoid(self.repo.lookup_reference(r).resolve())
+                    for r in self.repo.listall_references()
+                    if (r.startswith('refs/heads/') or
+                        r.startswith('refs/remotes/'))]
+        else:
+            try:
+                return [hexoid(self.repo.lookup_reference(self.rev).resolve())]
+            except KeyError:  # no such reference; look self.rev up directly
+                return [hexoid(self.repo[self.rev])]
+
+    def getcommit(self, rev):
+        def prettyname(sig):
+            if sig._name:
+                return '%s <%s>' % (sig._name, sig._email)
+            return sig._email
+
+        c = self.repo[rev.decode('hex')]
+        message = c._message
+        author = prettyname(c.author)
+        committer = prettyname(c.committer)
+        if author != committer:
+            message += '\ncommitter: %s\n' % committer
+        return commit(parents=[hexoid(p) for p in c.parents],
+                      date='%s %s' % (c.commit_time, c.commit_time_offset),
+                      author=author, desc=self.recode(message), rev=rev)
+
+    hex_nullid = hex(nullid)
+
+    def getfile(self, name, rev):
+        if rev == self.hex_nullid:
+            raise IOError
+        return self.repo[rev.decode('hex')].data, self.modecache[(name,rev)]
+
+    def gettags(self):
+        return dict((r.split('/',2)[-1],
+                     hexoid(self.repo.lookup_reference(r).resolve()))
+                    for r in self.repo.listall_references()
+                    if r.startswith('refs/tags/'))
+
+class convert_git_plain(convert_git_base):
+    def gitread(self, s):
+        fh = self.gitopen(s)
+        data = fh.read()
+        return data, fh.close()
+
+    def getheads(self):
+        if not self.rev:
+            heads, ret = self.gitread('git rev-parse --branches --remotes')
+            heads = heads.splitlines()
+        else:
+            heads, ret = self.gitread("git rev-parse --verify %s" % self.rev)
+            heads = [heads[:-1]]
+        if ret:
+            raise util.Abort(_('cannot retrieve git heads'))
+        return heads
+
+    def catfile(self, rev, type):
+        if rev == hex(nullid):
+            raise IOError()
+        data, ret = self.gitread("git cat-file %s %s" % (type, rev))
+        if ret:
+            raise util.Abort(_('cannot read %r object at %s') % (type, rev))
+        return data
+
+    def getfile(self, name, rev):
+        data = self.catfile(rev, "blob")
+        mode = self.modecache[(name, rev)]
+        return data, mode
+
     def getcommit(self, version):
         c = self.catfile(version, "commit") # read the commit hash
         end = c.find("\n\n")
@@ -161,24 +242,6 @@
 
         return tags
 
-    def getchangedfiles(self, version, i):
-        changes = []
-        if i is None:
-            fh = self.gitopen("git diff-tree --root -m -r %s" % version)
-            for l in fh:
-                if "\t" not in l:
-                    continue
-                m, f = l[:-1].split("\t")
-                changes.append(f)
-        else:
-            fh = self.gitopen('git diff-tree --name-only --root -r %s "%s^%s" --'
-                             % (version, version, i + 1))
-            changes = [f.rstrip('\n') for f in fh]
-        if fh.close():
-            raise util.Abort(_('cannot read changes in %s') % version)
-
-        return changes
-
     def getbookmarks(self):
         bookmarks = {}
 
@@ -205,3 +268,9 @@
                 pass
 
         return bookmarks
+
+def convert_git(ui, path, rev=None):
+    try:
+        return convert_git_pygit2(ui, path, rev)
+    except nopygit2:
+        return convert_git_plain(ui, path, rev)


More information about the Mercurial-devel mailing list