[PATCH 1 of 2 RFC] lfs: add support for serving blob files

Sun Feb 18 07:15:31 UTC 2018

# HG changeset patch
# User Matt Harbison <matt_harbison at yahoo.com>
# Date 1518937155 18000
#      Sun Feb 18 01:59:15 2018 -0500
# Node ID ba2e8627d87cfaca00931fe2dcee738c3c9a4f9d
# Parent  868bb2821e292cdda6050f229ade1af42d52c9e6
lfs: add support for serving blob files

There's a ton more to do, especially on the LFS side of things.  But for now,
I'm looking for a sanity check that this is the correct approach.  The protocol
is tied to http, and Gregory's recent refactoring at least gave me a clue where
to start.  But it doesn't quite fit, because the POST requests end up with a
'query' string, so HTTP_NOT_FOUND is returned.  I thought maybe I could just
wrap hgweb._runwsgi() to slurp the requests and bypass the core completely.  But
that's an instance method, and I didn't see a way to ensure every instance could
be subclassed.  That function also does a bit of work to populate the request
object.  So I went back to the new protocolhandler, and hacked the core to not
fail on an LFS handler.  (Assuming that this is generally OK, maybe core could
check an attribute on it to see if it's a native protocol before doing the query
and permission checks?)

The core hasn't been handling PUT requests, which are needed to upload files.  I
tried, but failed subclass the handler from the LFS extension, so I just added
it in core for now.  (I think I know what I did wrong, but it's trivial to add
to core, so IDK how much effort it's worth trying to wrap more stuff to keep it
out of there.)

The code is pretty well marked with TODOs.  I know very little about the
underlying python framework, or how this code fits into `hg serve` and a normal
webserver, so this is the result of a fair amount of trial and error.  On the
plus side, test-lfs-test-server.t can have test-lfs-serve swapped for
`hg serve`, and the test runs, modulo one point where corruption was introduced.
The server should be kicking back an error indicating the corruption, but it
aborts instead (this is already flagged as a TODO).

diff --git a/hgext/lfs/__init__.py b/hgext/lfs/__init__.py
--- a/hgext/lfs/__init__.py
+++ b/hgext/lfs/__init__.py
@@ -148,8 +148,11 @@
     util,
     vfs as vfsmod,
     wireproto,
+    wireprotoserver,
 )
 
+from mercurial.hgweb import server
+
 from . import (
     blobstore,
     wrapper,
@@ -315,6 +318,7 @@
 
     wrapfunction(exchange, 'push', wrapper.push)
     wrapfunction(wireproto, '_capabilities', wrapper._capabilities)
+    wrapfunction(wireprotoserver, 'parsehttprequest', wrapper._parsehttprequest)
 
     wrapfunction(context.basefilectx, 'cmp', wrapper.filectxcmp)
     wrapfunction(context.basefilectx, 'isbinary', wrapper.filectxisbinary)
diff --git a/hgext/lfs/blobstore.py b/hgext/lfs/blobstore.py
--- a/hgext/lfs/blobstore.py
+++ b/hgext/lfs/blobstore.py
@@ -114,13 +114,13 @@
 
         return self.vfs(oid, 'rb')
 
-    def download(self, oid, src):
+    def download(self, oid, src, limit=None):
         """Read the blob from the remote source in chunks, verify the content,
         and write to this local blobstore."""
         sha256 = hashlib.sha256()
 
         with self.vfs(oid, 'wb', atomictemp=True) as fp:
-            for chunk in util.filechunkiter(src, size=1048576):
+            for chunk in util.filechunkiter(src, size=1048576, limit=limit):
                 fp.write(chunk)
                 sha256.update(chunk)
 
diff --git a/hgext/lfs/wrapper.py b/hgext/lfs/wrapper.py
--- a/hgext/lfs/wrapper.py
+++ b/hgext/lfs/wrapper.py
@@ -8,6 +8,7 @@
 from __future__ import absolute_import
 
 import hashlib
+import json
 
 from mercurial.i18n import _
 from mercurial.node import bin, nullid, short
@@ -17,6 +18,13 @@
     filelog,
     revlog,
     util,
+    wireprotoserver,
+)
+
+from mercurial.hgweb.common import (
+    ErrorResponse,
+    HTTP_BAD_REQUEST,
+    HTTP_OK
 )
 
 from ..largefiles import lfutil
@@ -389,3 +397,188 @@
     if 'lfs' in repo.requirements:
         reqs.add('lfs')
     return reqs
+
+#def _runwsgi(self, req, repo):
+
+def _parsehttprequest(orig, repo, req, query):
+    """"""
+    # The batch API URL is the lfs server url + '/objects/batch'.  The default
+    # git URL is:
+    #
+    # Git remote: https://git-server.com/foo/bar
+    # LFS server: https://git-server.com/foo/bar.git/info/lfs
+    # Batch API: https://git-server.com/foo/bar.git/info/lfs/objects/batch
+    #
+    # '.git/' seems like it's not something a user would normally track.  Once
+    # lfs serving is in place, the url can be assumed if it isn't specified.
+    # And if we follow the git convention, maybe the implied lfs server URL also
+    # just works for hg-git.
+    if req.env[r'PATH_INFO'] == '/.git/info/lfs/objects/batch':
+
+        # https://github.com/git-lfs/git-lfs/blob/master/docs/api/batch.md
+        if (req.env[r'REQUEST_METHOD'] == r'POST'
+            and req.env[r'CONTENT_TYPE'] == 'application/vnd.git-lfs+json'
+            and req.env[r'HTTP_ACCEPT'] == 'application/vnd.git-lfs+json'):
+
+            proto = lfsprotocolhandler(req, repo.ui)
+            return {
+                'cmd': 'what command?',
+                'proto': proto,
+                'dispatch': lambda: _batch(repo, req, proto, 'cmdgoeshere'),
+                'handleerror': lambda ex: _handlehttperror(ex, req, 'cmdhere'),
+            }
+        else:
+            # TODO: figure out what the proper handling for a bad request to the
+            #       Batch API is.
+            raise ErrorResponse(HTTP_BAD_REQUEST, 'Invalid Batch API request')
+
+    elif req.env[r'PATH_INFO'].startswith('/.hg/store/lfs/objects'):
+        proto = lfsprotocolhandler(req, repo.ui)
+        return {
+            'cmd': 'what command?',
+            'proto': proto,
+            'dispatch': lambda: _transferfile(repo, req, proto, 'cmdgoeshere'),
+            'handleerror': lambda ex: _handlehttperror(ex, req, 'cmdhere'),
+        }
+
+    return orig(repo, req, query)
+
+def _transferfile(repo, req, proto, cmd):
+    """Handle a single file upload (PUT) or download (GET) action."""
+    import os
+    log = req.env[r'wsgi.errors']
+    #log.write('Got request %s' % req.env)
+
+    method = req.env[r'REQUEST_METHOD']
+    oid = os.path.basename(req.env[r'PATH_INFO'])
+    localstore = repo.svfs.lfslocalblobstore
+
+    if method == r'PUT':
+        length = int(req.env[r'CONTENT_LENGTH'])
+
+        # TODO: verify HTTP_ACCEPT?
+
+        # TODO: how to handle timeouts?  Without passing content-length here,
+        #       the call stalls.  What happens if a client sends less than it
+        #       says it will?
+
+        # TODO: download() will abort if the checksum fails.  It should raise
+        #       something checksum specific that can be caught here, and turned
+        #       into an http code.
+        localstore.download(oid, req, limit=length)
+        req.respond(HTTP_OK, 'text/plain')
+        return []
+    elif method == r'GET':
+        # TODO: figure out how to send back the file in chunks, instead of
+        #       reading the whole thing.
+        req.respond(HTTP_OK, 'application/vnd.git-lfs+json',
+                    body=localstore.read(oid))
+        return []
+    else:
+        raise ErrorResponse(HTTP_BAD_REQUEST,
+                            'Unsupported LFS transfer method: %s' % method)
+
+def _batchresponseobject(req, obj, action, store):
+    """Format an entry in the object list for a Batch API response.
+
+    obj: A single entry in the Batch API object request list
+    action: 'upload' or 'download'
+    store: The local blob store for servicing requests"""
+
+    # Successful lfs-test-server response to solict an upload:
+    # {u'objects': [
+    #      {u'size': 12,
+    #       u'oid': u'31cf...8e5b',
+    #       u'actions': {
+    #           u'upload': {
+    #               u'href': u'http://localhost:$HGPORT/objects/31cf...8e5b',
+    #               u'expires_at': u'0001-01-01T00:00:00Z',
+    #               u'header': {
+    #                   u'Accept': u'application/vnd.git-lfs'
+    #               }
+    #           }
+    #       }
+    #    }]
+    # }
+
+    # TODO: Sort out the expires_at/expires_in/authenticated keys.
+    # TODO: Figure out the proper href URL
+    # TODO: Check the store for corrupted files, and flag as needed
+
+    oid = obj.get('oid')
+    rsp = {
+        'oid': oid,
+        'size': obj.get('size'),
+#        'authenticated': True,
+    }
+
+    if action == 'upload' or store.has(oid):
+        rsp['actions'] = {
+            '%s' % action: {
+                'href': '%s://%s/.hg/store/lfs/objects/%s'
+                         % (req.env[r'wsgi.url_scheme'], req.env[r'HTTP_HOST'],
+                            oid),
+                "expires_at": "2018-11-10T15:29:07Z",
+                'header': {
+                    'Accept': 'application/vnd.git-lfs'
+                }
+            }
+        }
+    else:
+        rsp['error'] = {
+            'code': 404,
+            'message': "The object does not exist"
+        }
+
+    return rsp
+
+def _batch(repo, req, proto, cmd):
+    """Process a Batch API request.  The end result is to tell the client where
+    is can upload/download the files."""
+
+    log = req.env[r'wsgi.errors']
+
+    length = int(req.env[r'CONTENT_LENGTH'])
+    lfsreq = json.loads(req.read(length))
+
+#    log.write('Got request %s' % lfsreq)
+
+    if 'basic' not in lfsreq.get('transfers', ['basic']):
+        raise ErrorResponse(HTTP_BAD_REQUEST,
+                            'Only the basic LFS transfer handler is supported')
+
+    operation = lfsreq.get('operation')
+    if operation not in ('upload', 'download'):
+        raise ErrorResponse(HTTP_BAD_REQUEST,
+                            'Unsupported LFS transfer operation: %s'
+                            % operation )
+
+    localstore = repo.svfs.lfslocalblobstore
+
+    # If uploading, the entry is skipped if it already exists.
+    objects = [_batchresponseobject(req, p, operation, localstore)
+               for p in lfsreq.get('objects') if operation != 'upload'
+                                           or not localstore.has(p.get('oid'))]
+
+    rsp = json.dumps({
+        'transfer': 'basic',
+        'objects': objects,
+    })
+
+    # XXX: content-type and accept?
+    req.respond(HTTP_OK, 'application/vnd.git-lfs+json')
+
+    return rsp
+
+def _handlehttperror(ex, req, cmd):
+    # TODO: ????
+    raise error.Abort('in handle error! %s' % ex)
+
+# TODO: it probably doesn't make much sense to subclass an hg protocol handler
+class lfsprotocolhandler(wireprotoserver.httpv1protocolhandler):
+    def __init__(self, req, ui):
+        wireprotoserver.httpv1protocolhandler.__init__(self, req, ui)
+
+    @property
+    def name(self):
+        return 'http-lfs'
diff --git a/mercurial/hgweb/hgweb_mod.py b/mercurial/hgweb/hgweb_mod.py
--- a/mercurial/hgweb/hgweb_mod.py
+++ b/mercurial/hgweb/hgweb_mod.py
@@ -362,14 +362,16 @@
         protohandler = wireprotoserver.parsehttprequest(rctx.repo, req, query)
 
         if protohandler:
-            cmd = protohandler['cmd']
-            try:
-                if query:
-                    raise ErrorResponse(HTTP_NOT_FOUND)
-                if cmd in perms:
-                    self.check_perm(rctx, req, perms[cmd])
-            except ErrorResponse as inst:
-                return protohandler['handleerror'](inst)
+            # XXX: fix this hack!
+            if protohandler['proto'].name != 'http-lfs':
+                cmd = protohandler['cmd']
+                try:
+                    if query:
+                        raise ErrorResponse(HTTP_NOT_FOUND)
+                    if cmd in perms:
+                        self.check_perm(rctx, req, perms[cmd])
+                except ErrorResponse as inst:
+                    return protohandler['handleerror'](inst)
 
             return protohandler['dispatch']()
 
diff --git a/mercurial/hgweb/server.py b/mercurial/hgweb/server.py
--- a/mercurial/hgweb/server.py
+++ b/mercurial/hgweb/server.py
@@ -111,6 +111,10 @@
             self.log_error(r"Exception happened during processing "
                            r"request '%s':%s%s", self.path, newline, tb)
 
+    def do_PUT(self):
+        #self.log_error('*** handling put request')
+        self.do_POST()
+
     def do_GET(self):
         self.do_POST()