jupyter · dolfandringa · May 19, 2022 · May 19, 2022 · May 19, 2022 · May 19, 2022
diff --git a/README.md b/README.md
@@ -46,6 +46,11 @@ docker run -p 8080:8080 -e 'GITHUB_OAUTH_KEY=YOURKEY' \
 Or to use your GitHub personal access token, you can just set `GITHUB_API_TOKEN`.
 
 
+## S3 buckets
+Files in S3 buckets can be accessed by their S3 URI, e.g. `s3://bucket/path/to/key`. This works directly for public buckets. To access private buckets, you need to provide the S3 authentication credentials to the docker container or in your environment.
+For the docker container this can be done by setting the [environment variables](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#environment-variables) with `-e AWS_ACCESS_KEY_ID=my_secret_id -e AWS_SECRET_ACCESS_KEY=my_secret_key`.
+Alternatively, you can make the [shared credentials file](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#shared-credentials-file) available to the user running nbviewer (in docker, by mounting it as a volume).
+
 ## GitHub Enterprise
 
 To use nbviewer on your own GitHub Enterprise instance you need to set `GITHUB_API_URL`.

diff --git a/nbviewer/app.py b/nbviewer/app.py
@@ -197,6 +197,10 @@ class NBViewer(Application):
         default_value="nbviewer.providers.local.handlers.LocalFileHandler",
         help="The Tornado handler to use for viewing notebooks found on a local filesystem",
     ).tag(config=True)
+    s3_handler = Unicode(
+        default_value="nbviewer.providers.s3.handlers.S3Handler",
+        help="The Tornado handler to use for viewing notebooks from amazon S3",
+    ).tag(config=True)
     url_handler = Unicode(
         default_value="nbviewer.providers.url.handlers.URLHandler",
         help="The Tornado handler to use for viewing notebooks accessed via URL",
@@ -625,6 +629,7 @@ def init_tornado_application(self):
             github_user_handler=self.github_user_handler,
             index_handler=self.index_handler,
             local_handler=self.local_handler,
+            s3_handler=self.s3_handler,
             url_handler=self.url_handler,
             user_gists_handler=self.user_gists_handler,
         )

diff --git a/nbviewer/providers/__init__.py b/nbviewer/providers/__init__.py
@@ -6,12 +6,12 @@
 # -----------------------------------------------------------------------------
 
 default_providers = [
-    "nbviewer.providers.{}".format(prov) for prov in ["url", "github", "gist"]
+    "nbviewer.providers.{}".format(prov) for prov in ["url", "github", "gist", "s3"]
 ]
 
 default_rewrites = [
     "nbviewer.providers.{}".format(prov)
-    for prov in ["gist", "github", "dropbox", "huggingface", "url"]
+    for prov in ["gist", "github", "dropbox", "huggingface", "s3", "url"]
 ]
 
 
@@ -83,7 +83,7 @@ def _load_provider_feature(feature, providers, **handler_names):
         try:
             # Ex: handler_names['url_handler']
             handler_names[provider_handler_key]
-        except KeyError:
+        except KeyError as e:
             continue
         else:
             # Ex: provider_handlers['url_handler'] = handler_names['url_handler']

diff --git a/nbviewer/providers/s3/__init__.py b/nbviewer/providers/s3/__init__.py
@@ -0,0 +1,3 @@
+from .handlers import default_handlers
+from .handlers import S3Handler
+from .handlers import uri_rewrites
diff --git a/nbviewer/providers/s3/handlers.py b/nbviewer/providers/s3/handlers.py
@@ -0,0 +1,149 @@
+# -----------------------------------------------------------------------------
+#  Copyright (C) Jupyter Development Team
+#
+#  Distributed under the terms of the BSD License.  The full license is in
+#  the file COPYING, distributed as part of this software.
+# -----------------------------------------------------------------------------
+import errno
+import io
+import os
+from datetime import datetime
+from urllib.parse import urlparse
+
+import boto3
+import botocore
+from tornado import iostream
+from tornado import web
+
+from .. import _load_handler_from_location
+from ...utils import url_path_join
+from ..base import cached
+from ..base import RenderingHandler
+
+
+class S3Handler(RenderingHandler):
+    """Renderer for s3:// uris, serving notebooks from S3 buckets.
+
+    NOTE(review): the boto3 calls below are synchronous and are not offloaded.
+    """
+
+    def initialize(self, **kwargs):
+        """Create the S3 client and reset the cached download state."""
+        self.s3_client = boto3.client("s3")
+        self._downloadable_data = None
+        self._downloaded_path = None
+        super().initialize(**kwargs)
+
+    async def download(self, path):
+        """Send the raw notebook at s3 uri ``path`` as an attachment."""
+        headers = await self.get_notebook_headers(path)
+        filename = os.path.basename(path)
+        self.set_header("Content-Length", headers["ContentLength"])
+        # Escape commas to workaround Chrome issue with commas in download filenames
+        self.set_header(
+            "Content-Disposition",
+            "attachment; filename={};".format(filename.replace(",", "_")),
+        )
+        if self._downloaded_path == path and self._downloadable_data is not None:
+            content = self._downloadable_data
+        else:
+            content = await self.read_s3_file(path)
+
+        # read_s3_file returns str; iterating it would write and flush the
+        # response one character at a time, so send it as a single chunk.
+        if isinstance(content, (bytes, str)):
+            content = [content]
+        for chunk in content:
+            try:
+                self.write(chunk)
+                await self.flush()
+            except iostream.StreamClosedError:
+                return
+
+    async def get_notebook_data(self, path):
+        """Serve ``?download`` requests directly; else return ``path`` to render."""
+        if self.get_query_arguments("download"):
+            await self.download(path)
+            return None
+        return path
+
+    async def get_notebook_headers(self, path):
+        """Return the ``head_object`` response for ``path``; HTTP 404 if missing."""
+        o = urlparse(path)
+        bucket = o.netloc
+        key = o.path[1:]  # drop the leading "/" so only the object key remains
+        self.log.debug("Getting headers for %s from %s", key, bucket)
+        try:
+            return self.s3_client.head_object(Bucket=bucket, Key=key)
+        except botocore.exceptions.ClientError as ex:
+            if ex.response["Error"]["Code"] == "404":
+                self.log.info("The notebook %s does not exist.", path)
+                raise web.HTTPError(404)
+            raise
+
+    async def read_s3_file(self, path):
+        """Return the object at ``path`` as UTF-8 text (HTTP 404 if missing)."""
+        o = urlparse(path)
+        bucket = o.netloc
+        key = o.path[1:]
+        s3_file = io.BytesIO()
+        self.log.debug("Reading %s from %s", key, bucket)
+        try:
+            self.s3_client.download_fileobj(bucket, key, s3_file)
+        except botocore.exceptions.ClientError as ex:
+            if ex.response["Error"]["Code"] == "404":
+                self.log.info("The notebook %s does not exist.", path)
+                raise web.HTTPError(404)
+            raise
+        self.log.debug("Done downloading.")
+        # Remember the text so download() can reuse it for the same path.
+        self._downloadable_data = s3_file.getvalue().decode("utf-8")
+        self._downloaded_path = path
+        return self._downloadable_data
+
+    async def deliver_notebook(self, path):
+        """Read the notebook at ``path`` from S3 and pass it to ``finish_notebook``."""
+        nbdata = await self.read_s3_file(path)
+        # Explanation of some kwargs passed into `finish_notebook`:
+        # breadcrumbs: list of dict
+        #     Breadcrumb 'name' and 'url' to render as links at the top of the notebook page
+        # title: str
+        #     Title to use as the HTML page title (i.e., text on the browser tab)
+        await self.finish_notebook(
+            nbdata,
+            download_url="?download",
+            msg="file from s3: %s" % path,
+            public=False,
+            breadcrumbs=[],
+            title=os.path.basename(path),
+        )
+
+    @cached
+    async def get(self, path):
+        """Get an s3 notebook
+
+        Parameters
+        ==========
+        path: str
+            s3 uri
+        """
+        fullpath = await self.get_notebook_data(path)
+
+        # get_notebook_data returns None when the notebook was sent as a download,
+        # i.e. nothing is left to render, making deliver_notebook inappropriate
+        if fullpath is not None:
+            await self.deliver_notebook(fullpath)
+
+
+def default_handlers(handlers=[], **handler_names):
+    """Return ``handlers`` extended with the route for percent-encoded s3 uris."""
+    # The handler class is given as a dotted path under the "s3_handler" key.
+    handler_class = _load_handler_from_location(handler_names["s3_handler"])
+    s3_route = (r"/(s3%3A//.*)", handler_class, {})
+    return handlers + [s3_route]
+
+
+def uri_rewrites(rewrites=[]):
+    """Return ``rewrites`` extended with the rule that passes s3:// uris through."""
+    # Build on the incoming list (never mutated) instead of discarding it.
+    return rewrites + [(r"^(s3://.*)$", "{0}")]
diff --git a/nbviewer/providers/s3/tests/__init__.py b/nbviewer/providers/s3/tests/__init__.py
diff --git a/nbviewer/providers/s3/tests/test_s3.py b/nbviewer/providers/s3/tests/test_s3.py
@@ -0,0 +1,96 @@
+# -----------------------------------------------------------------------------
+#  Copyright (C) Jupyter Development Team
+#
+#  Distributed under the terms of the BSD License.  The full license is in
+#  the file COPYING, distributed as part of this software.
+# -----------------------------------------------------------------------------
+import io
+import json
+from copy import deepcopy
+from unittest.mock import patch
+
+import boto3
+import requests
+
+from ....tests.base import FormatHTMLMixin
+from ....tests.base import NBViewerTestCase
+
+
+MOCK_NOTEBOOK = {  # nbformat 4.5 notebook fixture with two empty code cells
+    "cells": [
+        {
+            "cell_type": "code",
+            "execution_count": None,
+            "id": "b0939771-a810-4ee0-b440-dbbaeb4f1653",
+            "metadata": {},
+            "outputs": [],
+            "source": [],
+        },
+        {
+            "cell_type": "code",
+            "execution_count": None,
+            "id": "cc0d476a-d09c-4919-8dd2-c8d67f7431b3",
+            "metadata": {},
+            "outputs": [],
+            "source": [],
+        },
+    ],
+    "metadata": {
+        "kernelspec": {
+            "display_name": "Python 3 (ipykernel)",
+            "language": "python",
+            "name": "python3",
+        },
+        "language_info": {
+            "codemirror_mode": {"name": "ipython", "version": 3},
+            "file_extension": ".py",
+            "mimetype": "text/x-python",
+            "name": "python",
+            "nbconvert_exporter": "python",
+            "pygments_lexer": "ipython3",
+            "version": "3.9.12",
+        },
+    },
+    "nbformat": 4,
+    "nbformat_minor": 5,
+}
+
+
+class MockBoto3:
+    """Stand-in for the two S3 client methods that S3Handler calls."""
+
+    def download_fileobj(self, Bucket, Key, fileobj):
+        """Write a fake notebook that mentions Bucket and Key into fileobj."""
+        data = deepcopy(MOCK_NOTEBOOK)
+        data["cells"][0]["source"] = [f"print({Bucket})", f"print({Key})"]
+        fileobj.write(json.dumps(data).encode("utf-8"))
+
+    def head_object(self, Bucket, Key):
+        """Return fake headers whose ContentLength matches the fake notebook."""
+        output_file = io.BytesIO()
+        self.download_fileobj(Bucket, Key, output_file)  # returns None; use the buffer
+        return {"ContentLength": len(output_file.getvalue())}
+
+
+"""
+# This test won't work because the server is started through subprocess.Popen, so we can't mock boto3.
+
+class S3TestCase(NBViewerTestCase):
+
+    @patch("boto3.client")
+    def test_url(self, mock_boto3_client):
+        mockBoto3 = MockBoto3()
+        mock_boto3_client.return_value = mockBoto3
+        with patch.object(mockBoto3, 'download_fileobj') as mock_download:
+            bucket="my_bucket"
+            key="my_file.ipynb"
+            url = self.url(f"s3%3A//{bucket}/{key}")
+            r = requests.get(url)
+            self.assertEqual(r.status_code, 200)
+            args = mock_download.call_args_list[-1][:2]
+            self.assertEqual(args, (bucket, key))
+
+
+class FormatHTMLLocalFileDefaultTestCase(S3TestCase, FormatHTMLMixin):
+    pass
+"""
diff --git a/requirements.in b/requirements.in
@@ -1,5 +1,6 @@
 elasticsearch
 ipython>=8
+boto3
 jupyter_client
 jupyter_server>=0.2.0
 markdown>=3.0,==3.1.1 # pin until we workaround #909, which is a regression in 3.2

diff --git a/requirements.txt b/requirements.txt
@@ -20,6 +20,12 @@ beautifulsoup4==4.11.1
     # via nbconvert
 bleach==5.0.1
     # via nbconvert
+boto3==1.23.3
+    # via -r requirements.in
+botocore==1.26.3
+    # via
+    #   boto3
+    #   s3transfer
 certifi==2022.12.7
     # via elastic-transport
 cffi==1.15.1
@@ -48,6 +54,10 @@ jinja2==3.1.2
     # via
     #   jupyter-server
     #   nbconvert
+jmespath==1.0.0
+    # via
+    #   boto3
+    #   botocore
 jsonschema==4.17.0
     # via nbformat
 jupyter-client==7.4.4
@@ -130,11 +140,15 @@ pyparsing==3.0.9
 pyrsistent==0.19.2
     # via jsonschema
 python-dateutil==2.8.2
-    # via jupyter-client
+    # via
+    #   botocore
+    #   jupyter-client
 pyzmq==24.0.1
     # via
     #   jupyter-client
     #   jupyter-server
+s3transfer==0.5.2
+    # via boto3
 send2trash==1.8.0
     # via jupyter-server
 six==1.16.0
@@ -171,7 +185,9 @@ traitlets==5.5.0
     #   nbconvert
     #   nbformat
 urllib3==1.26.12
-    # via elastic-transport
+    # via
+    #   botocore
+    #   elastic-transport
 wcwidth==0.2.5
     # via prompt-toolkit
 webencodings==0.5.1