diff --git a/Pipfile b/Pipfile index 8f5ac10d8..eb98e3895 100644 --- a/Pipfile +++ b/Pipfile @@ -48,7 +48,7 @@ six = "*" # yajl-py depends on it but doesn't declare so. https://github.com/py pyroute2 = "*" pyspawner = "*" pyicu = "*" -cjwmodule = "~=1.1.3" +cjwmodule = "==1.2.1" [dev-packages] watchdog = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 5923c6262..897d12952 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "49ebcc074bd07cbf2290a4da19767f1f70db769b55c0c654df80e5a986008536" + "sha256": "25dae15a5c0a4f9c2eebc5173090d043b35707ec541f252193c1c4c07fc66550" }, "pipfile-spec": 6, "requires": { @@ -75,30 +75,30 @@ }, "asyncpg": { "hashes": [ - "sha256:08599fbe81b254a28944cd60c0047f650b51acaa7ad396ae6f44e92d13dc9cb7", - "sha256:0e4e5ed6134434fbc932dc069f07c326e3e4c128472f1145d25a7e68a60352bc", - "sha256:14b73360da5b4febe3cb8d2f18148196843a4d4719f8db8412e2b858527473d7", - "sha256:160925de20f3b3828654a0145992de966dd2b7e1d6bd3aa7204d7880bf848f70", - "sha256:265ba356e216c1a96f490b35b138321d2e6dcf40302e2cd0df3b474a42644ae4", - "sha256:2a8f85029271c517e79e8068fdc32e37349a3fd45a3b592063dfc19dfa2ddadd", - "sha256:3d25a38326dc5184d014acaa4ab799cdfd4704eefc6d5ab0f9e3be395ea86e7d", - "sha256:4ac7fa44803ba32ac421900016690df3147ad6b732360162701c2fbb9aaa043a", - "sha256:64aa8330eb585b3804b298d0b5b1bc8d72b9345cce4f2e956b0ab85b1463b9f7", - "sha256:6a2a96d17e129ae7501c3fc323dc8d47d292113aa6e65c7c613e4e8374fb096e", - "sha256:90d10514e2814c981b104b97d93decc154ffe63e4e3f6e9bb80df1bdac433fba", - "sha256:9f82f5ab0db3c081c57bc9a4709891ecb3dccd8ce42a4de86c94e923ecefab1b", - "sha256:aa02dce5d5b801cff7dd2d99b767f5db312858f527ec1764777aad1fdefb5a7a", - "sha256:ab8ea6a13ac536b171780024b099f5190c22fab93b6f787e2cf3b39c66d7a687", - "sha256:b0ba2ae7b4a3d40611388e406bdda5c36bb580787c8fe70b28a33476347c699a", - "sha256:b32a03b8b18f96ea00711d495cf824bcf9d88e5268e89b26a7ec6cbf2a20f1dc", - "sha256:ba17521b1a338051bd274dadf16f4fd04568a43d3a873d6d3daac6169f626bac", - "sha256:c6c2c502a454417c03a13a99667b922afe3389249080a5dbeae130594c97eb8f", - "sha256:cc0882dfd860063d351680494050a92c9915d4693620357cf0d3b56f8153996f", - "sha256:e6efb9752dc86a5321eaf565969c15d32cfb702c6f25c2356eea629264d99d3d", - "sha256:e70b41e7ec05d77042f12a67cdc3cc76a0c21718d1e82ee35e0c58dc9ddf387d" + "sha256:058baec9d6b75612412baa872a1aa47317d0ff88c318a49f9c4a2389043d5a8d", + "sha256:0c336903c3b08e970f8af2f606332f1738dba156bca83ed0467dc2f5c70da796", + "sha256:1388caa456070dab102be874205e3ae8fd1de2577d5de9fa22e65ba5c0f8b110", + "sha256:25edb0b947eb632b6b53e5a4b36cba5677297bb34cbaba270019714d0a5fed76", + "sha256:2af6a5a705accd36e13292ea43d08c20b15e52d684beb522cb3a7d3c9c8f3f48", + "sha256:391aea89871df8c1560750af6c7170f2772c2d133b34772acf3637e3cf4db93e", + "sha256:394bf19bdddbba07a38cd6fb526ebf66e120444d6b3097332b78efd5b26495b0", + "sha256:5664d1bd8abe64fc60a0e701eb85fa1d8c9a4a8018a5a59164d27238f2caf395", + "sha256:57666dfae38f4dbf84ffbf0c5c0f78733fef0e8e083230275dcb9ccad1d5ee09", + "sha256:74510234c294c6a6767089ba9c938f09a491426c24405634eb357bd91dffd734", + "sha256:95cd2df61ee00b789bdcd04a080e6d9188693b841db2bf9a87ebaed9e53147e0", + "sha256:a981500bf6947926e53c48f4d60ae080af1b4ad7fa78e363465a5b5ad4f2b65e", + "sha256:a9e6fd6f0f9e8bd77e9a4e1ef9a4f83a80674d9136a754ae3603e915da96b627", + "sha256:ad5ba062e09673b1a4b8d0facaf5a6d9719bf7b337440d10b07fe994d90a9552", + "sha256:ba90d3578bc6dddcbce461875672fd9bdb34f0b8215b68612dd3b65a956ff51c", + "sha256:c773c7dbe2f4d3ebc9e3030e94303e45d6742e6c2fc25da0c46a56ea3d83caeb", + 
"sha256:da238592235717419a6a7b5edc8564da410ebfd056ca4ecc41e70b1b5df86fba", + "sha256:e39aac2b3a2f839ce65aa255ce416de899c58b7d38d601d24ca35558e13b48e3", + "sha256:ec6e7046c98730cb2ba4df41387e10cb8963a3ac2918f69ae416f8aab9ca7b1b", + "sha256:f0c9719ac00615f097fe91082b785bce36dbf02a5ec4115ede0ebfd2cd9500cb", + "sha256:f7184689177eeb5a11fa1b2baf3f6f2e26bfd7a85acf4de1a3adbd0867d7c0e2" ], "index": "pypi", - "version": "==0.20.0" + "version": "==0.20.1" }, "attrs": { "hashes": [ @@ -109,10 +109,10 @@ }, "autobahn": { "hashes": [ - "sha256:64fa063b3a1ab16588037d4713f13f66167f7ad2a2e95fd675decbc3bc85c089", - "sha256:986e284d32cefc7174849bf2ae3fe8999ce9903b5c455bce4623f119eb9bea15" + "sha256:407f588342dc0ccb46edf0523302ea1a912c11278f722756a1e29af533cd451e", + "sha256:5a1564e26a6cd598945676d504ede43a43d29389ab93db6d6abfba362df1851a" ], - "version": "==19.11.2" + "version": "==20.1.2" }, "automat": { "hashes": [ @@ -140,18 +140,18 @@ }, "boto3": { "hashes": [ - "sha256:5c00d51101d6a7ddf2207ae8a738e5c815c5fcffbee76121f38bd41d83c936a5", - "sha256:aa58c8de6aed36211e0897598de2a3d89122ad8cd1450165679720180ab880ef" + "sha256:916efdecda945daba08dca9032373440db681e53e6b46e7d0cae104ffd5ac7ca", + "sha256:925899699b81977b75d4036fc478209916f4547f8683e5c7b1ad678317a6652e" ], "index": "pypi", - "version": "==1.10.50" + "version": "==1.11.3" }, "botocore": { "hashes": [ - "sha256:765a5c637ff792239727c327b221ed5a4d851e9f176ce8b8b9eca536425c74d4", - "sha256:adb4cb188cd0866e7337f9a049fc68db042b0340fd496d40bca349c8dbfc6a2d" + "sha256:799562d2af023f49518676c6b72ee07325da027513406b4b9d8e5b74ecea8257", + "sha256:80e6522c4bb8d98dab9fd77708c6722bee775c7b85a77326ef44543062a17462" ], - "version": "==1.13.50" + "version": "==1.14.3" }, "cchardet": { "hashes": [ @@ -251,10 +251,10 @@ }, "cjwmodule": { "hashes": [ - "sha256:8b517642f8e556a6aa63cbebb801f1903db70023f642831f9090f4ec1f76cf37" + "sha256:89d052bbe1fcf02dd63a2369f9baf8ec3df4f2a60a09f12799a2a6ee20a25b1d" ], "index": "pypi", - "version": "==1.1.3" + "version": "==1.2.1" }, "constantly": { "hashes": [ @@ -408,6 +408,20 @@ "index": "pypi", "version": "==2.1.11" }, + "h11": { + "hashes": [ + "sha256:33d4bca7be0fa039f4e84d50ab00531047e53d6ee8ffbc83501ea602c169cae1", + "sha256:4bc6d6a1238b7615b266ada57e0618568066f57dd6fa967d1290ec9309b2f2f1" + ], + "version": "==0.9.0" + }, + "h2": { + "hashes": [ + "sha256:ac377fcf586314ef3177bfd90c12c7826ab0840edeb03f0f24f511858326049e", + "sha256:b8a32bd282594424c0ac55845377eea13fa54fe4a8db012f3a198ed923dc3ab4" + ], + "version": "==3.1.1" + }, "hext": { "hashes": [ "sha256:0859e493bcb89cb24ec054424b6d64c53be33b3e602fe43fb97da1b5b5fe34b2", @@ -426,6 +440,19 @@ "index": "pypi", "version": "==0.2.0" }, + "hpack": { + "hashes": [ + "sha256:0edd79eda27a53ba5be2dfabf3b15780928a0dff6eb0c60a3d6767720e970c89", + "sha256:8eec9c1f4bfae3408a3f30500261f7e6a65912dc138526ea054f9ad98892e9d2" + ], + "version": "==3.0.0" + }, + "hstspreload": { + "hashes": [ + "sha256:7ec1d33da7da00a79aed79e995b151f81824a5d4aac0547557da348711292d46" + ], + "version": "==2020.1.15" + }, "html5lib": { "hashes": [ "sha256:20b159aa3badc9d5ee8f5c647e5efd02ed2a66ab8d354930bd9ff139fc1dc0a3", @@ -434,6 +461,20 @@ "index": "pypi", "version": "==1.0.1" }, + "httpx": { + "hashes": [ + "sha256:2f2a7ce82cd4373af5a72d2646e20e4061da36b637ebcb88b93265083f132bff", + "sha256:abc46081611a86270d92c8fcaaf148f1eb33510c92096075fac27288bea3a9b2" + ], + "version": "==0.11.0" + }, + "hyperframe": { + "hashes": [ + "sha256:5187962cb16dcc078f23cb5a4b110098d546c3f41ff2d4038a9896893bbd0b40", + 
"sha256:a9f5c17f2cc3c719b917c4f33ed1c61bd1f8dfac4b1bd23b7c80b3400971b41f" + ], + "version": "==5.2.0" + }, "hyperlink": { "hashes": [ "sha256:4288e34705da077fada1111a24a0aa08bb1e76699c9ce49876af722441845654", @@ -536,25 +577,25 @@ }, "multidict": { "hashes": [ - "sha256:0f04bf4c15d8417401a10a650c349ccc0285943681bfd87d3690587d7714a9b4", - "sha256:15a61c0df2d32487e06f6084eabb48fd9e8b848315e397781a70caf9670c9d78", - "sha256:3c5e2dcbe6b04cbb4303e47a896757a77b676c5e5db5528be7ff92f97ba7ab95", - "sha256:5d2b32b890d9e933d3ced417924261802a857abdee9507b68c75014482145c03", - "sha256:5e5fb8bfebf87f2e210306bf9dd8de2f1af6782b8b78e814060ae9254ab1f297", - "sha256:63ba2be08d82ea2aa8b0f7942a74af4908664d26cb4ff60c58eadb1e33e7da00", - "sha256:73740fcdb38f0adcec85e97db7557615b50ec4e5a3e73e35878720bcee963382", - "sha256:78bed18e7f1eb21f3d10ff3acde900b4d630098648fe1d65bb4abfb3e22c4900", - "sha256:a02fade7b5476c4f88efe9593ff2f3286698d8c6d715ba4f426954f73f382026", - "sha256:aacbde3a8875352a640efa2d1b96e5244a29b0f8df79cbf1ec6470e86fd84697", - "sha256:be813fb9e5ce41a5a99a29cdb857144a1bd6670883586f995b940a4878dc5238", - "sha256:bfcad6da0b8839f01a819602aaa5c5a5b4c85ecbfae9b261a31df3d9262fb31e", - "sha256:c2bfc0db3166e68515bc4a2b9164f4f75ae9c793e9635f8651f2c9ffc65c8dad", - "sha256:c66d11870ae066499a3541963e6ce18512ca827c2aaeaa2f4e37501cee39ac5d", - "sha256:cc7f2202b753f880c2e4123f9aacfdb94560ba893e692d24af271dac41f8b8d9", - "sha256:d1f45e5bb126662ba66ee579831ce8837b1fd978115c9657e32eb3c75b92973d", - "sha256:ed5f3378c102257df9e2dc9ce6468dabf68bee9ec34969cfdc472631aba00316" - ], - "version": "==4.7.3" + "sha256:13f3ebdb5693944f52faa7b2065b751cb7e578b8dd0a5bb8e4ab05ad0188b85e", + "sha256:26502cefa86d79b86752e96639352c7247846515c864d7c2eb85d036752b643c", + "sha256:4fba5204d32d5c52439f88437d33ad14b5f228e25072a192453f658bddfe45a7", + "sha256:527124ef435f39a37b279653ad0238ff606b58328ca7989a6df372fd75d7fe26", + "sha256:5414f388ffd78c57e77bd253cf829373721f450613de53dc85a08e34d806e8eb", + "sha256:5eee66f882ab35674944dfa0d28b57fa51e160b4dce0ce19e47f495fdae70703", + "sha256:63810343ea07f5cd86ba66ab66706243a6f5af075eea50c01e39b4ad6bc3c57a", + "sha256:6bd10adf9f0d6a98ccc792ab6f83d18674775986ba9bacd376b643fe35633357", + "sha256:83c6ddf0add57c6b8a7de0bc7e2d656be3eefeff7c922af9a9aae7e49f225625", + "sha256:93166e0f5379cf6cd29746989f8a594fa7204dcae2e9335ddba39c870a287e1c", + "sha256:9a7b115ee0b9b92d10ebc246811d8f55d0c57e82dbb6a26b23c9a9a6ad40ce0c", + "sha256:a38baa3046cce174a07a59952c9f876ae8875ef3559709639c17fdf21f7b30dd", + "sha256:a6d219f49821f4b2c85c6d426346a5d84dab6daa6f85ca3da6c00ed05b54022d", + "sha256:a8ed33e8f9b67e3b592c56567135bb42e7e0e97417a4b6a771e60898dfd5182b", + "sha256:d7d428488c67b09b26928950a395e41cc72bb9c3d5abfe9f0521940ee4f796d4", + "sha256:dcfed56aa085b89d644af17442cdc2debaa73388feba4b8026446d168ca8dad7", + "sha256:f29b885e4903bd57a7789f09fe9d60b6475a6c1a4c0eca874d8558f00f9d4b51" + ], + "version": "==4.7.4" }, "networkx": { "hashes": [ @@ -735,10 +776,10 @@ }, "pyhamcrest": { "hashes": [ - "sha256:6b672c02fdf7470df9674ab82263841ce8333fb143f32f021f6cb26f0e512420", - "sha256:8ffaa0a53da57e89de14ced7185ac746227a8894dbd5a3c718bf05ddbd1d56cd" + "sha256:36cda5e849e3ee9ce54628667471e5e769f596259bb3c13c3596f6059a615ef8", + "sha256:5959cb4ab465b303522d2e0d270a6ee581c3ad9ba419e304bb6ebe50a60ea37a" ], - "version": "==1.9.0" + "version": "==2.0.0" }, "pyicu": { "hashes": [ @@ -787,7 +828,6 @@ "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", 
"sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" ], - "markers": "python_version >= '2.7'", "version": "==2.8.1" }, "python-http-client": { @@ -886,6 +926,13 @@ "index": "pypi", "version": "==1.3.0" }, + "rfc3986": { + "hashes": [ + "sha256:0344d0bd428126ce554e7ca2b61787b6a28d2bbd19fc70ed2dd85efe31176405", + "sha256:df4eba676077cefb86450c8f60121b9ae04b94f65f85b69f3f731af0516b7b18" + ], + "version": "==1.3.2" + }, "rfc3987": { "hashes": [ "sha256:10702b1e51e5658843460b189b185c0366d2cf4cff716f13111b0ea9fd2dce53", @@ -896,10 +943,10 @@ }, "s3transfer": { "hashes": [ - "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d", - "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba" + "sha256:2157640a47c8b8fa2071bdd7b0d57378ec8957eede3bd083949c2dcc4d9b0dd4", + "sha256:e3343ae0f371781c17590cf06cb818a54484fbac9a65a5be7603a39b0a6d7b31" ], - "version": "==0.2.1" + "version": "==0.3.0" }, "schedula": { "hashes": [ @@ -931,11 +978,11 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], "index": "pypi", - "version": "==1.13.0" + "version": "==1.14.0" }, "smmap2": { "hashes": [ @@ -944,6 +991,13 @@ ], "version": "==2.0.5" }, + "sniffio": { + "hashes": [ + "sha256:20ed6d5b46f8ae136d00b9dcb807615d83ed82ceea6b2058cecb696765246da5", + "sha256:8e3810100f69fe0edd463d02ad407112542a11ffdc29f67db2bf3771afb87a21" + ], + "version": "==1.1.0" + }, "soupsieve": { "hashes": [ "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5", @@ -1004,7 +1058,6 @@ "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293", "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745" ], - "markers": "python_version >= '3.4'", "version": "==1.25.7" }, "webencodings": { diff --git a/cjwkernel/pandas/http/errors.py b/cjwkernel/pandas/http/errors.py deleted file mode 100644 index 16403d91a..000000000 --- a/cjwkernel/pandas/http/errors.py +++ /dev/null @@ -1,59 +0,0 @@ -from cjwkernel.types import I18nMessage - - -__all__ = ["HttpError"] - - -class HttpError(Exception): - """ - An HTTP request did not complete. - """ - - @property - def i18n_message(self) -> I18nMessage: - return I18nMessage.TODO_i18n(self.args[0]) - - -class HttpErrorTimeout(HttpError): - def __init__(self): - return super().__init__("HTTP request timed out.") - - -class HttpErrorInvalidUrl(HttpError): - def __init__(self): - return super().__init__( - "Invalid URL. Please supply a valid URL, starting with http:// or https://." - ) - - -class HttpErrorTooManyRedirects(HttpError): - def __init__(self): - return super().__init__( - "HTTP server(s) redirected us too many times. Please try a different URL." 
- ) - - -class HttpErrorClientResponseError(HttpError): - # override - @property - def i18n_message(self) -> I18nMessage: - return I18nMessage.TODO_i18n( - "Error from server: HTTP %d %s" - % (self.__cause__.status, self.__cause__.message) - ) - - -class HttpErrorClientError(HttpError): - # override - @property - def i18n_message(self) -> I18nMessage: - return I18nMessage.TODO_i18n( - "Error during HTTP request: %s" % str(self.__cause__) - ) - - -HttpError.Timeout = HttpErrorTimeout -HttpError.ClientError = HttpErrorClientError -HttpError.InvalidUrl = HttpErrorInvalidUrl -HttpError.ClientResponseError = HttpErrorClientResponseError -HttpError.TooManyRedirects = HttpErrorTooManyRedirects diff --git a/cjwkernel/pandas/http/httpfile.py b/cjwkernel/pandas/http/httpfile.py deleted file mode 100644 index c7b37cdc3..000000000 --- a/cjwkernel/pandas/http/httpfile.py +++ /dev/null @@ -1,227 +0,0 @@ -r""" -Fetch data using HTTP, then parse it. - -Behavior --------- - -a. Perform an HTTP request and log traffic (gzipped) to output_file -b. If the server responds with a redirect, truncate output_file and restart -c. If there's an error or timeout, truncate output_file and raise error - -File format ------------ - -In case of success, a special HTTP log is written to a *gzipped* file: - - {"url":"https://example.com/test.csv"}\r\n - 200 OK\r\n - Response-Header-1: X\r\n - Response-Header-2: Y\r\n - \r\n - All body bytes - -`Transfer-Encoding`, `Content-Encoding` and `Content-Length` headers are -renamed `Cjw-Original-Transfer-Encoding`, `Cjw-Original-Content-Encoding` -and `Cjw-Original-Content-Length`. (The body in the HTTP log is dechunked -and decompressed, because Python's ecosystem doesn't have nice options for -storing raw HTTP traffic and dechunking from a file.) - -The params in the first line are UTF-8-encoded with no added whitespace -(so "\r\n" cannot appear); status is ASCII-encoded; headers are -latin1-encoded; the body is raw. (Rationale: each encoding is the content's -native encoding.) - -SECURITY: we don't store the request HTTP headers, because they may contain -OAuth tokens or API keys. - -Rationale for storing params: it avoids a race in which we render with new -params but an old fetch result. We only store the "fetch params" ("url"), -not the "render params" ("has_header"), so the file is byte-for-byte identical -when we fetch an unchanged URL with new render params. - -In case of redirect, only the last request is logged. -""" - -import asyncio -import contextlib -import json -import gzip -from pathlib import Path -import shutil -import ssl -from typing import ContextManager, Dict, BinaryIO, Optional, Tuple -import aiohttp -from .errors import HttpError -from .. import moduleutils -from cjwkernel.util import tempfile_context - - -__all__ = ["read", "write", "download", "extract_first_header_from_str"] - - -def write(f: BinaryIO, url: str, headers: Dict[str, str], body: BinaryIO) -> None: - """ - Write httpfile-formatted data to `f`. - - This module's docstring describes the file format. - """ - # set gzip mtime=0 so we can write the exact same file given the exact - # same data. (This helps with testing and versioning.) 
- with gzip.GzipFile(mode="wb", filename="", fileobj=f, mtime=0) as zf: - # Write URL -- original URL, not redirected URL - zf.write( - json.dumps( - {"url": url}, # SECURITY: don't store "headers" secrets - ensure_ascii=False, - allow_nan=False, - separators=(",", ":"), - sort_keys=True, - ).encode("utf-8") - + b"\r\n" - ) - # Write status line -- INCORRECT but oh well - zf.write(b"200 OK\r\n") - # Write response headers. - # - # Ideally we'd be using raw headers. But moduleutils gives - # parsed headers. Let's not bother with purity: just - # re-encode the parsed headers. - for k, v in headers.items(): - # bytesio is already dechunked and decompressed. Mangle - # these headers to make file consistent with itself. - if k.lower() in {"transfer-encoding", "content-encoding", "content-length"}: - k = "Cjw-Original-" + k - elif k.lower() not in {"content-type", "content-disposition", "server"}: - # Skip writing most headers. This is a HACK: we skip the - # `Date` header so fetcher will see a byte-for-byte - # identical output file given byte-for-byte identical - # input. That will convince fetcher to ignore the result. - # See `fetcher.versions`. TODO redefine "versions" and - # revisit this logic: the user probably _expects_ us to - # store headers every fetch, though body may not change. - continue - # There's no way to put \r\n in an HTTP header name or value. - # Good thing: if a server could do that, this file format would - # be unreadable. - assert "\n" not in k and "\n" not in v - zf.write(f"{k}: {v}\r\n".encode("latin1")) - zf.write(b"\r\n") - - # Write body - shutil.copyfileobj(body, zf) - - -async def download( - url: str, - output_path: Path, - *, - headers: Dict[str, str] = {}, - ssl: Optional[ssl.SSLContext] = None, -) -> None: - """ - Download from `url` to an httpfile-format file. - - This module's docstring describes the file format. - - Raise HttpError if download fails. - """ - timeout = aiohttp.ClientTimeout(total=5 * 60, connect=30) - - try: - async with moduleutils.spooled_data_from_url( - url, headers, timeout, ssl=ssl - ) as (bytesio, headers, charset): - # This shouldn't be a context manager. Oh well. Ignore the fact - # that bytesio is backed by a file. It's safe to read the file - # after we exit the context and the file is deleted. - pass - except asyncio.TimeoutError: - output_path.write_bytes(b"") # truncate file - raise HttpError.Timeout - except aiohttp.InvalidURL: - raise HttpError.InvalidUrl - except aiohttp.TooManyRedirects: - raise HttpError.TooManyRedirects - except aiohttp.ClientResponseError as err: - raise HttpError.ClientResponseError from err - except aiohttp.ClientError as err: - raise HttpError.ClientError from err - - # The following shouldn't ever error. - with output_path.open("wb") as f: - write(f, url, headers, bytesio) - - -@contextlib.contextmanager -def read(httpfile_path: Path) -> ContextManager[Tuple[Path, str]]: - r""" - Yield `(body: Path, url: str, headers: str)` by parsing `httpfile_path`. - - The yielded `body` contains the downloaded HTTP body. The body is decoded - according to the HTTP server's `Content-Encoding` and `Transfer-Encoding`. - - The yielded `str` contains HTTP-encoded headers. They are separated by \r\n - and the final line ends with \r\n. Their `Content-Encoding`, - `Transfer-Encoding` and `Content-Length` headers are _not_ prefixed with - `Cjw-Original-`. Do not use these headers to validate `body`, because - `body` is already decoded. 
- """ - with tempfile_context(prefix="body-") as body_path: - with httpfile_path.open("rb") as f, gzip.GzipFile(mode="rb", fileobj=f) as zf: - # read params (line 1) - fetch_params_json = zf.readline() - fetch_params = json.loads(fetch_params_json) - url = fetch_params["url"] - - # read and skip status (line 2) - zf.readline() - - # read headers (lines ending in "\r\n" plus one final "\r\n") - header_lines = [] - while True: - line = zf.readline().decode("latin1") - if not line.strip(): - # "\r\n" on its own line means "end of headers" - break - if line.startswith("Cjw-Original-"): - line = line[len("Cjw-Original-") :] - header_lines.append(line) - headers = "".join(header_lines) # ends with last header_line's "\r\n" - - # Read body into tempfile - with body_path.open("wb") as body_f: - shutil.copyfileobj(zf, body_f) - - # Yield - yield body_path, url, headers - - -def extract_first_header_from_str(headers: str, header: str) -> Optional[str]: - r""" - Scan `headers` for a (case-insensitive) `header`; return its value. - - `headers` must be in the format yielded by `read()`. - - >>> headers = "Content-Type: text/plain; charset=utf-8\r\nX-Foo: Bar\nX-Foo: Baz" - - Searches are case-insensitive: - >>> extract_first_header_from_str(headers, "content-type") - "text/plain; charset=utf-8" - - If a header is repeated, only the first value is returned: - >>> extract_first_header_from_str(headers, "x-foo") - "Bar" - - If a header is missing, return `None`: - >>> extract_first_header_from_str(headers, "content-length") - None - """ - # Assume headers are well-formed: otherwise we wouldn't have written them - # to the `headers` value we're parsing here. - key = header.upper() - for header_line in headers.split("\r\n"): - if header_line: # last split result is "" - header, value = header_line.split(":", 1) - if header.upper() == key: - return value.strip() - return None diff --git a/cjwkernel/pandas/moduletypes.py b/cjwkernel/pandas/moduletypes.py index 3c39948f7..57302b0bd 100644 --- a/cjwkernel/pandas/moduletypes.py +++ b/cjwkernel/pandas/moduletypes.py @@ -1,6 +1,18 @@ from typing import Any, Dict, Iterable, List, Optional, Union, Tuple -I18nMessage = Tuple[str, Dict[str, Any]] +I18nMessageSource = Dict[str, str] +""" Indicates which catalog we should search to find the text of the message. + - A dict with key `"module"` means it's coming from a module; + for example `{"module": "mymodule"}` means, "search the mymodule module's catalog" + - A dict with key `"library"` indicates it's coming from some of our supported libraries; + for example `{"library": "cjwmodule"}` means, "search the cjwmodule library's catalog" +""" +# We can define the dict more precisely in python 3.8 using a `TypedDict` + + +I18nMessage = Union[ + Tuple[str, Dict[str, Any]], Tuple[str, Dict[str, Any], I18nMessageSource] +] Message = Union[str, I18nMessage] RenderError = Union[Message, Dict[str, Union[Message, List[Any]]]] diff --git a/cjwkernel/pandas/types.py b/cjwkernel/pandas/types.py index 551796962..5d3530e5c 100644 --- a/cjwkernel/pandas/types.py +++ b/cjwkernel/pandas/types.py @@ -411,11 +411,38 @@ class I18nMessage: args: Dict[str, Union[int, float, str]] = field(default_factory=dict) """Arguments (empty if message does not need any -- which is common).""" + source: Optional[Dict[str, str]] = None + """An indication of where the message is coming from. 
+ - `None` means it's coming from workbench itself + - A dict with key `"module"` means it's coming from a module + for example `{"module": "mymodule"}` means, "search the mymodule module's catalog" + - A dict with key `"library"` indicates it's coming from some of our supported libraries; + for example `{"library": "cjwmodule"}` means, "search the cjwmodule library's catalog" + """ + @classmethod def from_arrow(cls, value: atypes.I18nMessage) -> I18nMessage: + if value.source: + if isinstance(value.source, atypes.I18nMessageSource.Module): + return cls(value.id, value.args, {"module": value.source.module_id}) + if isinstance(value.source, atypes.I18nMessageSource.Library): + return cls(value.id, value.args, {"library": value.source.library}) return cls(value.id, value.args) def to_arrow(self) -> atypes.I18nMessage: + if self.source: + if "module" in self.source: + return atypes.I18nMessage( + self.id, + self.args, + atypes.I18nMessageSource.Module(self.source["module"]), + ) + if "library" in self.source: + return atypes.I18nMessage( + self.id, + self.args, + atypes.I18nMessageSource.Library(self.source["library"]), + ) return atypes.I18nMessage(self.id, self.args) @classmethod @@ -431,7 +458,7 @@ def TODO_i18n(cls, text: str) -> I18nMessage: @classmethod def coerce(cls, value: mtypes.Message) -> I18nMessage: - """ Convert an internationalized message as return from modules to an object of this dataclass. + """ Convert an internationalized message as returned from modules to an object of this dataclass. Raises: - ValueError, if the value is a list of the wrong length or if the value is of a non-supported type @@ -440,7 +467,7 @@ def coerce(cls, value: mtypes.Message) -> I18nMessage: if isinstance(value, str): return cls.TODO_i18n(value) elif isinstance(value, tuple): - if len(value) != 2: + if len(value) < 2 or len(value) > 3: raise ValueError( "This tuple cannot be coerced to I18nMessage: %s" % value ) @@ -452,10 +479,30 @@ def coerce(cls, value: mtypes.Message) -> I18nMessage: raise ValueError( "Message arguments must be a dict, got %s" % type(value[1]).__name__ ) - return cls(value[0], value[1]) + if len(value) == 3: + source = value[2] + if not isinstance(source, dict): + raise ValueError( + "Message source must be a dict, got %s" % type(source).__name__ + ) + if not source: + raise ValueError("Message source can't be empty if present") + if len(source) > 1: + raise ValueError( + "Message can only have a single source, got %s" % source + ) + supported_source_keys = {"module", "library"} + if set(source.keys()) - supported_source_keys: + raise ValueError( + "Unknown message source kind %s. Supported kinds: %s." 
+ % (source.keys(), supported_source_keys) + ) + else: + source = None + return cls(value[0], value[1], source) else: raise ValueError( - "%s if of type %s, which cannot be coerced to I18nMessage" + "%s is of type %s, which cannot be coerced to I18nMessage" % (value, type(value).__name__) ) diff --git a/cjwkernel/tests/pandas/test_types.py b/cjwkernel/tests/pandas/test_types.py index c953df9dd..466cbd1c6 100644 --- a/cjwkernel/tests/pandas/test_types.py +++ b/cjwkernel/tests/pandas/test_types.py @@ -235,6 +235,52 @@ def test_coerce_from_dict(self): with self.assertRaises(ValueError): I18nMessage.coerce({"id": "my_id", "arguments": {"hello": "there"}}) + def test_coerce_with_source_none(self): + with self.assertRaises(ValueError): + I18nMessage.coerce(("my_id", {"hello": "there"}, None)), + + def test_coerce_with_source_empty(self): + with self.assertRaises(ValueError): + I18nMessage.coerce(("my_id", {"hello": "there"}, {})), + + def test_coerce_with_source_module(self): + self.assertEqual( + I18nMessage.coerce(("my_id", {"hello": "there"}, {"module": "testmodule"})), + I18nMessage("my_id", {"hello": "there"}, {"module": "testmodule"}), + ) + + def test_coerce_with_source_library(self): + self.assertEqual( + I18nMessage.coerce(("my_id", {"hello": "there"}, {"library": "cjwmodule"})), + I18nMessage("my_id", {"hello": "there"}, {"library": "cjwmodule"}), + ) + + def test_coerce_with_source_error_many_sources(self): + with self.assertRaises(ValueError): + I18nMessage.coerce( + ( + "my_id", + {"hello": "there"}, + {"library": "cjwmodule", "module": "testmodule"}, + ) + ) + + def test_coerce_with_source_error_unsupported_source_kind(self): + with self.assertRaises(ValueError): + I18nMessage.coerce( + ("my_id", {"hello": "there"}, {"mygreatsource": "cjwmodule"}) + ) + + def test_coerce_with_source_error_type_tuple(self): + with self.assertRaises(ValueError): + I18nMessage.coerce( + ("my_id", {"hello": "there"}, ("cjwmodule", "testmodule")) + ) + + def test_coerce_with_source_error_type_string(self): + with self.assertRaises(ValueError): + I18nMessage.coerce(("my_id", {"hello": "there"}, "cjwmodule")) + def test_to_arrow(self): self.assertEqual( I18nMessage("my_id", {"hello": "there"}).to_arrow(), @@ -247,6 +293,54 @@ def test_from_arrow(self): I18nMessage("my_id", {"hello": "there"}), ) + def test_to_arrow_with_source_module(self): + self.assertEqual( + I18nMessage( + "my_id", {"hello": "there"}, {"module": "testmodule"} + ).to_arrow(), + atypes.I18nMessage( + "my_id", + {"hello": "there"}, + atypes.I18nMessageSource.Module("testmodule"), + ), + ) + + def test_from_arrow_with_source_module(self): + self.assertEqual( + I18nMessage.from_arrow( + atypes.I18nMessage( + "my_id", + {"hello": "there"}, + atypes.I18nMessageSource.Module("testmodule"), + ) + ), + I18nMessage("my_id", {"hello": "there"}, {"module": "testmodule"}), + ) + + def test_to_arrow_with_source_library(self): + self.assertEqual( + I18nMessage( + "my_id", {"hello": "there"}, {"library": "cjwmodule"} + ).to_arrow(), + atypes.I18nMessage( + "my_id", + {"hello": "there"}, + atypes.I18nMessageSource.Library("cjwmodule"), + ), + ) + + def test_from_arrow_with_source_library(self): + self.assertEqual( + I18nMessage.from_arrow( + atypes.I18nMessage( + "my_id", + {"hello": "there"}, + atypes.I18nMessageSource.Library("cjwmodule"), + ) + ), + I18nMessage("my_id", {"hello": "there"}, {"library": "cjwmodule"}), + ) + class ProcessResultErrorTests(unittest.TestCase): def test_from_string(self): diff --git a/cjwkernel/tests/test_types.py 
b/cjwkernel/tests/test_types.py index db118aff8..0c6090245 100644 --- a/cjwkernel/tests/test_types.py +++ b/cjwkernel/tests/test_types.py @@ -284,7 +284,7 @@ def test_params_filename_from_thrift_file_not_found_is_error(self): {"A": ttypes.ParamValue(filename_value="does_not_exist")}, self.basedir ) - def test_i18n_message_from_thrift(self): + def test_i18n_message_from_thrift_source_module(self): self.assertEqual( types.I18nMessage.from_thrift( ttypes.I18nMessage( @@ -294,12 +294,89 @@ def test_i18n_message_from_thrift(self): "b": ttypes.I18nArgument(i32_value=12345678), "c": ttypes.I18nArgument(double_value=0.123), }, + ttypes.I18nMessageSource(module_id="testmodule"), + ) + ), + types.I18nMessage( + "modules.x.y", + {"a": "s", "b": 12345678, "c": 0.123}, + types.I18nMessageSource.Module("testmodule"), + ), + ) + + def test_i18n_message_to_thrift_source_module(self): + self.assertEqual( + types.I18nMessage( + "modules.x.y", + {"a": "s", "b": 12345678, "c": 0.123}, + types.I18nMessageSource.Module("testmodule"), + ).to_thrift(), + ttypes.I18nMessage( + "modules.x.y", + { + "a": ttypes.I18nArgument(string_value="s"), + "b": ttypes.I18nArgument(i32_value=12345678), + "c": ttypes.I18nArgument(double_value=0.123), + }, + ttypes.I18nMessageSource(module_id="testmodule"), + ), + ) + + def test_i18n_message_from_thrift_source_library(self): + self.assertEqual( + types.I18nMessage.from_thrift( + ttypes.I18nMessage( + "modules.x.y", + { + "a": ttypes.I18nArgument(string_value="s"), + "b": ttypes.I18nArgument(i32_value=12345678), + "c": ttypes.I18nArgument(double_value=0.123), + }, + ttypes.I18nMessageSource(library="cjwmodule"), + ) + ), + types.I18nMessage( + "modules.x.y", + {"a": "s", "b": 12345678, "c": 0.123}, + types.I18nMessageSource.Library("cjwmodule"), + ), + ) + + def test_i18n_message_to_thrift_source_library(self): + self.assertEqual( + types.I18nMessage( + "modules.x.y", + {"a": "s", "b": 12345678, "c": 0.123}, + types.I18nMessageSource.Library("cjwmodule"), + ).to_thrift(), + ttypes.I18nMessage( + "modules.x.y", + { + "a": ttypes.I18nArgument(string_value="s"), + "b": ttypes.I18nArgument(i32_value=12345678), + "c": ttypes.I18nArgument(double_value=0.123), + }, + ttypes.I18nMessageSource(library="cjwmodule"), + ), + ) + + def test_i18n_message_from_thrift_source_none(self): + self.assertEqual( + types.I18nMessage.from_thrift( + ttypes.I18nMessage( + "modules.x.y", + { + "a": ttypes.I18nArgument(string_value="s"), + "b": ttypes.I18nArgument(i32_value=12345678), + "c": ttypes.I18nArgument(double_value=0.123), + }, + ttypes.I18nMessageSource(), ) ), types.I18nMessage("modules.x.y", {"a": "s", "b": 12345678, "c": 0.123}), ) - def test_i18n_message_to_thrift(self): + def test_i18n_message_to_thrift_source_none(self): self.assertEqual( types.I18nMessage( "modules.x.y", {"a": "s", "b": 12345678, "c": 0.123} @@ -311,10 +388,71 @@ def test_i18n_message_to_thrift(self): "b": ttypes.I18nArgument(i32_value=12345678), "c": ttypes.I18nArgument(double_value=0.123), }, + ttypes.I18nMessageSource(), ), ) - def test_i18n_message_from_dict(self): + def test_i18n_message_from_dict_source_library(self): + self.assertEqual( + types.I18nMessage.from_dict( + { + "id": "modules.x.y", + "arguments": ["s", 12345678, 0.123], + "source": {"library": "cjwmodule"}, + } + ), + types.I18nMessage( + "modules.x.y", + ["s", 12345678, 0.123], + types.I18nMessageSource.Library("cjwmodule"), + ), + ) + + def test_i18n_message_to_dict_source_library(self): + self.assertEqual( + types.I18nMessage( + "modules.x.y", + ["s", 
12345678, 0.123], + types.I18nMessageSource.Library("cjwmodule"), + ).to_dict(), + { + "id": "modules.x.y", + "arguments": ["s", 12345678, 0.123], + "source": {"library": "cjwmodule"}, + }, + ) + + def test_i18n_message_from_dict_source_module(self): + self.assertEqual( + types.I18nMessage.from_dict( + { + "id": "modules.x.y", + "arguments": ["s", 12345678, 0.123], + "source": {"module": "testmodule"}, + } + ), + types.I18nMessage( + "modules.x.y", + ["s", 12345678, 0.123], + types.I18nMessageSource.Module("testmodule"), + ), + ) + + def test_i18n_message_to_dict_source_module(self): + self.assertEqual( + types.I18nMessage( + "modules.x.y", + ["s", 12345678, 0.123], + types.I18nMessageSource.Module("testmodule"), + ).to_dict(), + { + "id": "modules.x.y", + "arguments": ["s", 12345678, 0.123], + "source": {"module": "testmodule"}, + }, + ) + + def test_i18n_message_from_dict_no_source(self): self.assertEqual( types.I18nMessage.from_dict( {"id": "modules.x.y", "arguments": ["s", 12345678, 0.123]} @@ -322,7 +460,7 @@ def test_i18n_message_from_dict(self): types.I18nMessage("modules.x.y", ["s", 12345678, 0.123]), ) - def test_i18n_message_to_dict(self): + def test_i18n_message_to_dict_no_source(self): self.assertEqual( types.I18nMessage("modules.x.y", ["s", 12345678, 0.123]).to_dict(), {"id": "modules.x.y", "arguments": ["s", 12345678, 0.123]}, @@ -376,7 +514,7 @@ def test_quick_fix_from_thrift(self): self.assertEqual( types.QuickFix.from_thrift( ttypes.QuickFix( - ttypes.I18nMessage("click", {}), + ttypes.I18nMessage("click", {}, ttypes.I18nMessageSource()), ttypes.QuickFixAction( prepend_step=ttypes.PrependStepQuickFixAction( "filter", ttypes.RawParams('{"x":"y"}') @@ -397,7 +535,7 @@ def test_quick_fix_to_thrift(self): types.QuickFixAction.PrependStep("filter", {"x": "y"}), ).to_thrift(), ttypes.QuickFix( - ttypes.I18nMessage("click", {}), + ttypes.I18nMessage("click", {}, ttypes.I18nMessageSource()), ttypes.QuickFixAction( prepend_step=ttypes.PrependStepQuickFixAction( "filter", ttypes.RawParams('{"x":"y"}') @@ -444,10 +582,10 @@ def test_render_error_from_thrift(self): self.assertEqual( types.RenderError.from_thrift( ttypes.RenderError( - ttypes.I18nMessage("foo", {}), + ttypes.I18nMessage("foo", {}, ttypes.I18nMessageSource()), [ ttypes.QuickFix( - ttypes.I18nMessage("click", {}), + ttypes.I18nMessage("click", {}, ttypes.I18nMessageSource()), ttypes.QuickFixAction( prepend_step=ttypes.PrependStepQuickFixAction( "filter", ttypes.RawParams('{"x":"y"}') @@ -480,10 +618,10 @@ def test_render_error_to_thrift(self): ], ).to_thrift(), ttypes.RenderError( - ttypes.I18nMessage("foo", {}), + ttypes.I18nMessage("foo", {}, ttypes.I18nMessageSource()), [ ttypes.QuickFix( - ttypes.I18nMessage("click", {}), + ttypes.I18nMessage("click", {}, ttypes.I18nMessageSource()), ttypes.QuickFixAction( prepend_step=ttypes.PrependStepQuickFixAction( "filter", ttypes.RawParams('{"x":"y"}') @@ -501,7 +639,11 @@ def test_render_error_from_dict(self): "message": {"id": "err", "arguments": {}}, "quickFixes": [ { - "buttonText": {"id": "click", "arguments": {}}, + "buttonText": { + "id": "click", + "arguments": {}, + "source": {"library": "cjwmodule"}, + }, "action": { "type": "prependStep", "moduleSlug": "filter", @@ -515,7 +657,9 @@ def test_render_error_from_dict(self): types.I18nMessage("err", {}), [ types.QuickFix( - types.I18nMessage("click"), + types.I18nMessage( + "click", {}, types.I18nMessageSource.Library("cjwmodule") + ), types.QuickFixAction.PrependStep("filter", {"x": "y"}), ) ], @@ -528,7 +672,9 @@ def 
test_render_error_to_dict(self): types.I18nMessage("err", {}), [ types.QuickFix( - types.I18nMessage("click"), + types.I18nMessage( + "click", {}, types.I18nMessageSource.Library("cjwmodule") + ), types.QuickFixAction.PrependStep("filter", {"x": "y"}), ) ], @@ -537,7 +683,11 @@ def test_render_error_to_dict(self): "message": {"id": "err", "arguments": {}}, "quickFixes": [ { - "buttonText": {"id": "click", "arguments": {}}, + "buttonText": { + "id": "click", + "arguments": {}, + "source": {"library": "cjwmodule"}, + }, "action": { "type": "prependStep", "moduleSlug": "filter", diff --git a/cjwkernel/thrift.thrift b/cjwkernel/thrift.thrift index 6e62ee4db..b10e5afbe 100644 --- a/cjwkernel/thrift.thrift +++ b/cjwkernel/thrift.thrift @@ -192,6 +192,19 @@ union I18nArgument { 3: double double_value } +/** + * Source of a translatable string. + * + * If no field is set, the source is Workbench's own catalog. + */ +union I18nMessageSource { + /** The message comes from the "library" library. */ + 1: string library, + + /** The message comes from the "module_id" module. */ + 2: string module_id +} + /** Translation key and arguments. */ struct I18nMessage { /** Message ID. For instance, `modules.renamecolumns.duplicateColname` */ @@ -203,6 +216,9 @@ struct I18nMessage { * For instance, `{"nColumns": 3, "exampleColumn": "Column X"}` */ 2: map arguments + + /** Pointer to code repository whose catalog translates the message. */ + 3: I18nMessageSource source } /** Instruction that upon clicking a button, Workbench should create a Step. */ diff --git a/cjwkernel/thrift/ttypes.py b/cjwkernel/thrift/ttypes.py index c40235d30..cec806231 100644 --- a/cjwkernel/thrift/ttypes.py +++ b/cjwkernel/thrift/ttypes.py @@ -1163,6 +1163,91 @@ def __ne__(self, other): return not (self == other) +class I18nMessageSource(object): + """ + Source of a translatable string. 
+ + If none of the values are set, this means it's coming from workbench itself + + Attributes: + - library: The message comes from the "library" library + - module_id: The message comes from the "module_id" module + `"module"` is reserved in thrift, that's why we added `"_id"` + + """ + + __slots__ = ( + 'library', + 'module_id', + ) + + + def __init__(self, library=None, module_id=None,): + self.library = library + self.module_id = module_id + + def read(self, iprot): + if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None: + iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec]) + return + iprot.readStructBegin() + while True: + (fname, ftype, fid) = iprot.readFieldBegin() + if ftype == TType.STOP: + break + if fid == 1: + if ftype == TType.STRING: + self.library = iprot.readString().decode('utf-8') if sys.version_info[0] == 2 else iprot.readString() + else: + iprot.skip(ftype) + elif fid == 2: + if ftype == TType.STRING: + self.module_id = iprot.readString().decode('utf-8') if sys.version_info[0] == 2 else iprot.readString() + else: + iprot.skip(ftype) + else: + iprot.skip(ftype) + iprot.readFieldEnd() + iprot.readStructEnd() + + def write(self, oprot): + if oprot._fast_encode is not None and self.thrift_spec is not None: + oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec])) + return + oprot.writeStructBegin('I18nMessageSource') + if self.library is not None: + oprot.writeFieldBegin('library', TType.STRING, 1) + oprot.writeString(self.library.encode('utf-8') if sys.version_info[0] == 2 else self.library) + oprot.writeFieldEnd() + if self.module_id is not None: + oprot.writeFieldBegin('module_id', TType.STRING, 2) + oprot.writeString(self.module_id.encode('utf-8') if sys.version_info[0] == 2 else self.module_id) + oprot.writeFieldEnd() + oprot.writeFieldStop() + oprot.writeStructEnd() + + def validate(self): + return + + def __repr__(self): + L = ['%s=%r' % (key, getattr(self, key)) + for key in self.__slots__] + return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) + + def __eq__(self, other): + if not isinstance(other, self.__class__): + return False + for attr in self.__slots__: + my_val = getattr(self, attr) + other_val = getattr(other, attr) + if my_val != other_val: + return False + return True + + def __ne__(self, other): + return not (self == other) + + class I18nMessage(object): """ Translation key and arguments. @@ -1172,18 +1257,21 @@ class I18nMessage(object): - arguments: Arguments (if Message ID takes any). For instance, `{"nColumns": 3, "exampleColumn": "Column X"}` + - source: An indication of where the message comes from. 
""" __slots__ = ( 'id', 'arguments', + 'source', ) - def __init__(self, id=None, arguments=None,): + def __init__(self, id=None, arguments=None, source=None,): self.id = id self.arguments = arguments + self.source = source def read(self, iprot): if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None: @@ -1211,6 +1299,12 @@ def read(self, iprot): iprot.readMapEnd() else: iprot.skip(ftype) + elif fid == 3: + if ftype == TType.STRUCT: + self.source = I18nMessageSource() + self.source.read(iprot) + else: + iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() @@ -1233,6 +1327,10 @@ def write(self, oprot): viter31.write(oprot) oprot.writeMapEnd() oprot.writeFieldEnd() + if self.source is not None: + oprot.writeFieldBegin('source', TType.STRUCT, 3) + self.source.write(oprot) + oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() @@ -2220,11 +2318,18 @@ def __ne__(self, other): (2, TType.I32, 'i32_value', None, None, ), # 2 (3, TType.DOUBLE, 'double_value', None, None, ), # 3 ) +all_structs.append(I18nMessageSource) +I18nMessageSource.thrift_spec = ( + None, # 0 + (1, TType.STRING, 'library', 'UTF8', None, ), # 1 + (2, TType.STRING, 'module_id', 'UTF8', None, ), # 2 +) all_structs.append(I18nMessage) I18nMessage.thrift_spec = ( None, # 0 (1, TType.STRING, 'id', 'UTF8', None, ), # 1 (2, TType.MAP, 'arguments', (TType.STRING, 'UTF8', TType.STRUCT, [I18nArgument, None], False), None, ), # 2 + (3, TType.STRUCT, 'source', [I18nMessageSource, None], None, ), # 3 ) all_structs.append(PrependStepQuickFixAction) PrependStepQuickFixAction.thrift_spec = ( diff --git a/cjwkernel/types.py b/cjwkernel/types.py index 50a79c986..9909a90eb 100644 --- a/cjwkernel/types.py +++ b/cjwkernel/types.py @@ -582,6 +582,67 @@ def _i18n_argument_to_thrift(value: Union[str, int, float]) -> ttypes.I18nArgume raise RuntimeError("Unhandled value for I18nArgument: %r" % value) +@dataclass(frozen=True) +class I18nMessageSource: + @classmethod + def from_thrift( + cls, value: ttypes.I18nMessageSource + ) -> Optional[I18nMessageSource]: + if value.module_id is not None: + return I18nMessageSource.Module(value.module_id) + elif value.library is not None: + return I18nMessageSource.Library(value.library) + else: + return None + + @abstractmethod + def to_thrift(self) -> ttypes.I18nMessageSource: + pass + + @classmethod + def from_dict(cls, value: Dict[str, Any]) -> Optional[I18nMessageSource]: + if value.get("module") is not None: + return I18nMessageSource.Module(value["module"]) + elif value.get("library") is not None: + return I18nMessageSource.Library(value["library"]) + else: + return None + + @abstractmethod + def to_dict(self) -> Dict[str, Any]: + pass + + +@dataclass(frozen=True) +class I18nMessageSourceModule(I18nMessageSource): + """An indication that the message is coming from a module.""" + + module_id: str + + def to_thrift(self) -> ttypes.I18nMessageSource: + return ttypes.I18nMessageSource(module_id=self.module_id) + + def to_dict(self) -> Dict[str, Any]: + return {"module": self.module_id} + + +@dataclass(frozen=True) +class I18nMessageSourceLibrary(I18nMessageSource): + """An indication that the message is coming from some of our supported libraries.""" + + library: str + + def to_thrift(self) -> ttypes.I18nMessageSource: + return ttypes.I18nMessageSource(library=self.library) + + def to_dict(self) -> Dict[str, Any]: + return {"library": self.library} + + +I18nMessageSource.Library = I18nMessageSourceLibrary 
+I18nMessageSource.Module = I18nMessageSourceModule + + @dataclass(frozen=True) class I18nMessage: """Translation key and arguments.""" @@ -592,16 +653,22 @@ class I18nMessage: args: Dict[str, Union[int, float, str]] = field(default_factory=dict) """Arguments (empty if message does not need any -- which is common).""" + source: Optional[I18nMessageSource] = None + """Where the message comes from, or `None` if it comes from Workbench proper.""" + @classmethod def from_thrift(cls, value: ttypes.I18nMessage) -> I18nMessage: return cls( value.id, {k: _i18n_argument_from_thrift(v) for k, v in value.arguments.items()}, + I18nMessageSource.from_thrift(value.source), ) def to_thrift(self) -> ttypes.I18nMessage: return ttypes.I18nMessage( - self.id, {k: _i18n_argument_to_thrift(v) for k, v in self.args.items()} + self.id, + {k: _i18n_argument_to_thrift(v) for k, v in self.args.items()}, + self.source.to_thrift() if self.source else ttypes.I18nMessageSource(), ) @classmethod @@ -634,10 +701,21 @@ def trans( @classmethod def from_dict(cls, value: Dict[str, Any]) -> I18nMessage: - return cls(value["id"], value["arguments"]) + return cls( + value["id"], + value["arguments"], + I18nMessageSource.from_dict(value.get("source", {})), + ) def to_dict(self) -> Dict[str, Any]: - return {"id": self.id, "arguments": self.args} + if self.source: + return { + "id": self.id, + "arguments": self.args, + "source": self.source.to_dict(), + } + else: + return {"id": self.id, "arguments": self.args} ParamValue = Optional[ @@ -754,11 +832,11 @@ def from_dict(cls, value: Dict[str, Any]): else: raise ValueError("Unhandled type in QuickFixAction: %r", value) - @abstractmethod + # override def to_thrift(self) -> ttypes.QuickFixAction: pass - @abstractmethod + # override def to_dict(self) -> Dict[str, Any]: """ Convert to Dict. 
diff --git a/cjworkbench/tests/i18n/test_trans.py b/cjworkbench/tests/i18n/test_trans.py index 72e177da2..0655da6d8 100644 --- a/cjworkbench/tests/i18n/test_trans.py +++ b/cjworkbench/tests/i18n/test_trans.py @@ -1,7 +1,8 @@ +import logging +from babel.messages.catalog import Catalog from django.test import SimpleTestCase -from cjworkbench.i18n.trans import MessageTranslator from icu import ICUError, InvalidArgsError -from babel.messages.catalog import Catalog +from cjworkbench.i18n.trans import MessageTranslator class TransTest(SimpleTestCase): @@ -59,12 +60,11 @@ def test_default_invalid_parameter_syntax(self): def test_message_invalid_parameter_syntax(self): catalog = Catalog() catalog.add("id", string="Hey {a b}!") - self.assertEqual( - MessageTranslator("en", catalog).trans( + with self.assertLogs(level=logging.ERROR): + result = MessageTranslator("en", catalog).trans( "id", default="Hello {a} {b}", parameters={"a": "you", "b": "!"} - ), - "Hello you !", - ) + ) + self.assertEqual(result, "Hello you !") # Tests that a default message can include a numeric variable def test_default_numeric_parameter(self): @@ -210,12 +210,11 @@ def test_default_invalid_parameter_syntax(self): def test_message_invalid_parameter_syntax(self): catalog = Catalog() catalog.add("id", string="Hey {a b}!") - self.assertEqual( - MessageTranslator("en", catalog).trans_html( + with self.assertLogs(level=logging.ERROR): + result = MessageTranslator("en", catalog).trans_html( "id", default="Hello {a} {b}", parameters={"a": "you", "b": "!"} - ), - "Hello you !", - ) + ) + self.assertEqual(result, "Hello you !") # Tests that a default message can include a numeric variable def test_default_numeric_parameter(self): diff --git a/renderer/execute/wf_module.py b/renderer/execute/wf_module.py index 461a699df..f1a504f62 100644 --- a/renderer/execute/wf_module.py +++ b/renderer/execute/wf_module.py @@ -398,15 +398,3 @@ async def execute_wfmodule( # of future modules, setting their cached_render_result_delta_id = # last_relevant_delta_id? Investigate whether this is worthwhile. return result - - -def build_status_dict(result: RenderResult, delta_id: int) -> Dict[str, Any]: - return { - "output_columns": [c.to_dict() for c in result.table.metadata.columns], - "output_errors": [ - error.to_js_value_unwrapping_TODO_i18n() for error in result.errors - ], - "output_status": result.status, - "output_n_rows": result.table.metadata.n_rows, - "cached_render_result_delta_id": delta_id, - } diff --git a/staticmodules/googlesheets.py b/staticmodules/googlesheets.py index 7215ad105..b96a28319 100644 --- a/staticmodules/googlesheets.py +++ b/staticmodules/googlesheets.py @@ -4,7 +4,7 @@ import os from typing import Any, Dict, List, Optional from oauthlib import oauth2 -from cjwkernel.pandas.http import httpfile +from cjwmodule.http import httpfile, HttpError from cjwkernel.pandas.parse import MimeType, parse_file from cjwkernel.pandas.types import ProcessResult from cjwkernel.pandas import moduleutils @@ -77,29 +77,33 @@ async def do_download( try: await httpfile.download(url, output_path, headers=headers, ssl=SSL_CONTEXT) - except httpfile.HttpError.ClientResponseError as err: - cause = err.__cause__ - if cause.status == 401: + except HttpError.NotSuccess as err: + response = err.response + if response.status_code == 401: return TODO_i18n_fetch_error( output_path, "Invalid credentials. Please reconnect to Google Drive." 
) - elif cause.status == 403: + elif response.status_code == 403: return TODO_i18n_fetch_error( output_path, "You chose a file your logged-in user cannot access. Please reconnect to Google Drive or choose a different file.", ) - elif cause.status == 404: + elif response.status_code == 404: return TODO_i18n_fetch_error( output_path, "File not found. Please choose a different file." ) else: - return TODO_i18n_fetch_error( - output_path, - "GDrive responded with HTTP %d %s" % (cause.status, cause.message), + # HACK: *err.i18n_message because i18n_message is a tuple + # compatible with I18nMessage() ctor + return FetchResult( + output_path, errors=[RenderError(I18nMessage(*err.i18n_message))] ) - except httpfile.HttpError as err: - os.truncate(output_path, 0) - return FetchResult(output_path, errors=[RenderError(err.i18n_message)]) + except HttpError as err: + # HACK: *err.i18n_message because i18n_message is a tuple + # compatible with I18nMessage() ctor + return FetchResult( + output_path, errors=[RenderError(I18nMessage(*err.i18n_message))] + ) return FetchResult(output_path) @@ -134,8 +138,8 @@ def _calculate_mime_type(content_type: str) -> MimeType: def _render_file(path: Path, params: Dict[str, Any], output_path: Path): - with httpfile.read(path) as (body_path, url, headers): - content_type = httpfile.extract_first_header_from_str(headers, "Content-Type") + with httpfile.read(path) as (parameters, status_line, headers, body_path): + content_type = httpfile.extract_first_header(headers, "Content-Type") mime_type = _calculate_mime_type(content_type) # Ignore Google-reported charset. Google's headers imply latin-1 when # their data is utf-8. diff --git a/staticmodules/loadurl.py b/staticmodules/loadurl.py index d945d6106..eedf68962 100644 --- a/staticmodules/loadurl.py +++ b/staticmodules/loadurl.py @@ -1,4 +1,4 @@ -r""" +""" Fetch data using HTTP, then parse it. Behavior @@ -28,7 +28,6 @@ from typing import Any, Dict, List, Optional from cjwkernel import parquet from cjwkernel.pandas import moduleutils -from cjwkernel.pandas.http import httpfile from cjwkernel.pandas.parse import parse_file, MimeType from cjwkernel.pandas.types import ProcessResult # deprecated from cjwkernel.types import ( @@ -38,6 +37,7 @@ RenderError, RenderResult, ) +from cjwmodule.http import httpfile, HttpError ExtensionMimeTypes = { @@ -76,7 +76,7 @@ def guess_mime_type_or_none(content_type: str, url: str) -> MimeType: # No match? Check for a known extension in the URL. # ".csv" becomes "text/csv". for extension, mime_type in ExtensionMimeTypes.items(): - if extension in url: + if url.split("?", 1)[0].endswith(extension): return mime_type # We ignored MimeType.TXT before. Check for it now. 
@@ -122,10 +122,10 @@ def _render_deprecated_parquet(


 def _render_file(path: Path, output_path: Path, params: Dict[str, Any]):
-    with httpfile.read(path) as (body_path, url, headers):
-        content_type = httpfile.extract_first_header_from_str(headers, "Content-Type")
+    with httpfile.read(path) as (parameters, status_line, headers, body_path):
+        content_type = httpfile.extract_first_header(headers, "Content-Type")
-        mime_type = guess_mime_type_or_none(content_type, url)
+        mime_type = guess_mime_type_or_none(content_type, parameters["url"])
         if not mime_type:
             return RenderResult(
                 errors=[
@@ -184,11 +184,15 @@ def fetch_arrow(
 ) -> FetchResult:
     url: str = params["url"].strip()
     mimetypes = ",".join(v.value for v in AllowedMimeTypes)
-    headers = {"Accept": mimetypes}
+    headers = [("Accept", mimetypes)]
     try:
         asyncio.run(httpfile.download(url, output_path, headers=headers))
-    except httpfile.HttpError as err:
-        return FetchResult(output_path, errors=[RenderError(err.i18n_message)])
+    except HttpError as err:
+        # HACK: *err.i18n_message because i18n_message is a tuple
+        # compatible with I18nMessage() ctor
+        return FetchResult(
+            output_path, errors=[RenderError(I18nMessage(*err.i18n_message))]
+        )
     return FetchResult(output_path)
diff --git a/staticmodules/tests/test_googlesheets.py b/staticmodules/tests/test_googlesheets.py
index ccfa18286..6a15e48e9 100644
--- a/staticmodules/tests/test_googlesheets.py
+++ b/staticmodules/tests/test_googlesheets.py
@@ -1,18 +1,17 @@
 import contextlib
 from http.server import HTTPServer, BaseHTTPRequestHandler
 import io
+import logging
 from pathlib import Path
 import unittest
 import ssl
 import threading
 from typing import Any, ContextManager, Dict, Optional
-from aiohttp import web
 import pandas as pd
 import pyarrow
 from staticmodules import googlesheets
 from staticmodules.googlesheets import fetch_arrow, render_arrow, migrate_params
 from .util import MockHttpResponse, MockParams
-from cjwkernel.pandas.http import httpfile
 from cjwkernel.types import (
     ArrowTable,
     I18nMessage,
@@ -22,6 +21,7 @@
 )
 from cjwkernel.tests.util import assert_arrow_table_equals, parquet_file
 from cjwkernel.util import tempfile_context
+from cjwmodule.http import httpfile


 expected_table = pd.DataFrame({"foo": [1, 2], "bar": [2, 3]})
@@ -51,27 +51,6 @@ def secrets(secret: Optional[Dict[str, Any]]) -> Dict[str, Any]:
     return {}


-@contextlib.contextmanager
-def fetch(
-    params: Dict[str, Any], secrets: Dict[str, Any]
-) -> ContextManager[FetchResult]:
-    with tempfile_context(prefix="output-") as output_path:
-        yield fetch_arrow(params, secrets, None, None, output_path)
-
-
-MOCK_ROUTES = {
-    "csv": (
-        "aushwyhtbndh7365YHALsdfsdf987IBHJB98uc9uisdj",
-        "/drive/v3/files/aushwyhtbndh7365YHALsdfsdf987IBHJB98uc9uisdj?alt=media",
-        web.Response(body=b"foocsv,bar\n1,2\n2,3"),
-        pd.DataFrame({"foocsv": [1, 2], "bar": [2, 3]}),
-    ),
-    "tsv": (),
-    "xls": (),
-    "xlsx": (),
-}
-
-
 class FetchTests(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -127,8 +106,17 @@ def tearDown(self):
         googlesheets.SSL_CONTEXT = self.old_ssl_context
         super().tearDown()

+    @contextlib.contextmanager
+    def fetch(
+        self, params: Dict[str, Any], secrets: Dict[str, Any]
+    ) -> ContextManager[FetchResult]:
+        with tempfile_context(prefix="output-") as output_path:
+            with self.assertLogs(level=logging.DEBUG) as cm:
+                yield fetch_arrow(params, secrets, None, None, output_path)
+
     def test_fetch_nothing(self):
-        with fetch(P(file=None), secrets={}) as result:
+        with tempfile_context(prefix="output-") as output_path:
+            result = fetch_arrow(P(file=None), {}, None, None, output_path)
             self.assertEqual(
                 result.errors,
                 [RenderError(I18nMessage.TODO_i18n("Please choose a file"))],
             )
@@ -139,12 +127,12 @@ def test_fetch_native_sheet(self):
         self.mock_http_response = MockHttpResponse.ok(
             body, [("Content-Type", "text/csv")]
         )
-        with fetch(P(), secrets(DEFAULT_SECRET)) as result:
+        with self.fetch(P(), secrets(DEFAULT_SECRET)) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)
                 self.assertEqual(
-                    headers, "Content-Type: text/csv\r\nContent-Length: 11\r\n"
+                    headers, [("content-type", "text/csv"), ("content-length", "11")]
                 )
         self.assertRegex(
             self.last_http_requestline, "/files/.*/export\?mimeType=text%2Fcsv"
         )
@@ -155,14 +143,14 @@ def test_fetch_csv_file(self):
         self.mock_http_response = MockHttpResponse.ok(
             body, [("Content-Type", "text/csv")]
         )
-        with fetch(
+        with self.fetch(
             P(file={**default_file, "mimeType": "text/csv"}), secrets(DEFAULT_SECRET)
         ) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)
                 self.assertEqual(
-                    headers, "Content-Type: text/csv\r\nContent-Length: 11\r\n"
+                    headers, [("content-type", "text/csv"), ("content-length", "11")]
                 )
         self.assertRegex(self.last_http_requestline, "/files/.*?alt=media")
@@ -171,16 +159,19 @@ def test_fetch_tsv_file(self):
         self.mock_http_response = MockHttpResponse.ok(
             body, [("Content-Type", "text/tab-separated-values")]
         )
-        with fetch(
+        with self.fetch(
             P(file={**default_file, "mimeType": "text/tab-separated-values"}),
             secrets(DEFAULT_SECRET),
         ) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)
                 self.assertEqual(
                     headers,
-                    "Content-Type: text/tab-separated-values\r\nContent-Length: 11\r\n",
+                    [
+                        ("content-type", "text/tab-separated-values"),
+                        ("content-length", "11"),
+                    ],
                 )
         self.assertRegex(self.last_http_requestline, "/files/.*?alt=media")
@@ -189,16 +180,19 @@ def test_fetch_xls_file(self):
         self.mock_http_response = MockHttpResponse.ok(
             body, [("Content-Type", "application/vnd.ms-excel")]
         )
-        with fetch(
+        with self.fetch(
             P(file={**default_file, "mimeType": "application/vnd.ms-excel"}),
             secrets(DEFAULT_SECRET),
         ) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)
                 self.assertEqual(
                     headers,
-                    "Content-Type: application/vnd.ms-excel\r\nContent-Length: 4\r\n",
+                    [
+                        ("content-type", "application/vnd.ms-excel"),
+                        ("content-length", "4"),
+                    ],
                 )
         self.assertRegex(self.last_http_requestline, "/files/.*?alt=media")
@@ -213,7 +207,7 @@ def test_fetch_xlsx_file(self):
                 )
             ],
         )
-        with fetch(
+        with self.fetch(
             P(
                 file={
                     **default_file,
@@ -223,16 +217,23 @@ def test_fetch_xlsx_file(self):
             secrets(DEFAULT_SECRET),
         ) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)
                 self.assertEqual(
                     headers,
-                    "Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\r\nContent-Length: 4\r\n",
+                    [
+                        (
+                            "content-type",
+                            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                        ),
+                        ("content-length", "4"),
+                    ],
                 )
         self.assertRegex(self.last_http_requestline, "/files/.*?alt=media")

     def test_missing_secret_error(self):
-        with fetch(P(), secrets=secrets(None)) as result:
+        with tempfile_context() as output_path:
+            result = fetch_arrow(P(), secrets(None), None, None, output_path)
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -243,7 +244,7 @@ def test_missing_secret_error(self):

     def test_invalid_auth_error(self):
         self.mock_http_response = MockHttpResponse(401)
-        with fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
+        with self.fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -258,7 +259,7 @@ def test_not_found(self):
         self.mock_http_response = MockHttpResponse(404)
-        with fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
+        with self.fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -273,7 +274,7 @@ def test_no_access_error(self):
         self.mock_http_response = MockHttpResponse(403)
-        with fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
+        with self.fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -292,7 +293,7 @@ def test_unhandled_http_error(self):
         self.mock_http_response = MockHttpResponse.ok(
             b"hi", headers=[("Content-Encoding", "gzip")]
         )
-        with fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
+        with self.fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -300,7 +301,7 @@ def test_unhandled_http_error(self):
                 RenderError(
                     I18nMessage.TODO_i18n(
                         # googlesheet should pass through aiohttp's message
-                        "Error during HTTP request: 400, message='Can not decode content-encoding: gzip'"
+                        "Error during HTTP request: DecodingError"
                     )
                 )
             ],
@@ -379,13 +380,13 @@ def test_render_deprecated_parquet_has_header_false(self):

     def test_render_has_header_true(self):
         with tempfile_context("http") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah",
-                    {"Content-Type": "text/csv"},
-                    io.BytesIO(b"A,B\na,b"),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "https://blah"},
+                "200 OK",
+                [("content-type", "text/csv")],
+                io.BytesIO(b"A,B\na,b"),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=True),
@@ -398,13 +399,13 @@ def test_render_has_header_false(self):
         with tempfile_context("http") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah",
-                    {"Content-Type": "text/csv"},
-                    io.BytesIO(b"1,2\n3,4"),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "https://blah"},
+                "200 OK",
+                [("content-type", "text/csv")],
+                io.BytesIO(b"1,2\n3,4"),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=False),
diff --git a/staticmodules/tests/test_loadurl.py b/staticmodules/tests/test_loadurl.py
index 129ab1317..c1d3f7acc 100644
--- a/staticmodules/tests/test_loadurl.py
+++ b/staticmodules/tests/test_loadurl.py
@@ -3,13 +3,13 @@
 from http.server import HTTPServer, BaseHTTPRequestHandler
 import io
 import itertools
+import logging
 from pathlib import Path
 import threading
 from typing import ContextManager
 import unittest
 import zlib
 import pyarrow
-from cjwkernel.pandas.http import httpfile
 from cjwkernel.util import tempfile_context
 from cjwkernel.tests.util import assert_arrow_table_equals, parquet_file
 from cjwkernel.types import (
@@ -19,6 +19,7 @@
     RenderError,
     RenderResult,
 )
+from cjwmodule.http import httpfile
 from staticmodules.loadurl import fetch_arrow, render_arrow
 from .util import MockHttpResponse, MockParams
@@ -28,14 +29,6 @@
 P = MockParams.factory(url="", has_header=True)


-@contextlib.contextmanager
-def fetch(url: str = "", has_header: bool = True) -> ContextManager[FetchResult]:
-    with tempfile_context(prefix="output-") as output_path:
-        yield fetch_arrow(
-            {"url": url, "has_header": has_header}, {}, None, None, output_path
-        )
-
-
 class FetchTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -79,6 +72,16 @@ def tearDown(self):
         self.server_thread.join()
         super().tearDown()

+    @contextlib.contextmanager
+    def fetch(
+        self, url: str = "", has_header: bool = True
+    ) -> ContextManager[FetchResult]:
+        with tempfile_context(prefix="output-") as output_path:
+            with self.assertLogs(level=logging.DEBUG):
+                yield fetch_arrow(
+                    {"url": url, "has_header": has_header}, {}, None, None, output_path
+                )
+
     def build_url(self, path: str) -> str:
         """
         Build a URL that points to our HTTP server.
         """
@@ -91,13 +94,16 @@ def test_fetch_csv(self):
         self.mock_http_response = MockHttpResponse.ok(
             body, [("Content-Type", "text/csv; charset=utf-8")]
         )
-        with fetch(url) as result:
+        with self.fetch(url) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)
                 self.assertEqual(
                     headers,
-                    "Content-Type: text/csv; charset=utf-8\r\nContent-Length: 11\r\n",
+                    [
+                        ("content-type", "text/csv; charset=utf-8"),
+                        ("content-length", "11"),
+                    ],
                 )
@@ -107,24 +113,26 @@ def test_fetch_gzip_encoded_csv(self):
             gzip.compress(body),
             [("Content-Type", "text/csv; charset=utf-8"), ("Content-Encoding", "gzip")],
         )
-        with fetch(url) as result:
+        with self.fetch(url) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)

     def test_fetch_deflate_encoded_csv(self):
         body = b"A,B\nx,y\nz,a"
+        zo = zlib.compressobj(wbits=-zlib.MAX_WBITS)
+        zbody = zo.compress(body) + zo.flush()
         url = self.build_url("/path/to.csv.gz")
         self.mock_http_response = MockHttpResponse.ok(
-            zlib.compress(body),
+            zbody,
             [
                 ("Content-Type", "text/csv; charset=utf-8"),
                 ("Content-Encoding", "deflate"),
             ],
         )
-        with fetch(url) as result:
+        with self.fetch(url) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), body)
@@ -132,15 +140,15 @@ def test_fetch_chunked_csv(self):
             [b"A,B\nx", b",y\nz,", b"a"], [("Content-Type", "text/csv; charset=utf-8")]
         )
         url = self.build_url("/path/to.csv.chunks")
-        with fetch(url) as result:
+        with self.fetch(url) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (_, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), b"A,B\nx,y\nz,a")

     def test_fetch_http_404(self):
         self.mock_http_response = MockHttpResponse(404, [("Content-Length", 0)])
         url = self.build_url("/not-found")
-        with fetch(url) as result:
+        with self.fetch(url) as result:
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -152,7 +160,10 @@ def test_fetch_http_404(self):
         )

     def test_fetch_invalid_url(self):
-        with fetch("htt://blah") as result:
+        with tempfile_context(prefix="output-") as output_path:
+            result = fetch_arrow(
+                {"url": "htt://blah", "has_header": False}, {}, None, None, output_path
+            )
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -176,11 +187,11 @@ def test_fetch_follow_redirect(self):
                 MockHttpResponse.ok(b"A,B\n1,2", [("Content-Type", "text/csv")]),
             ]
         )
-        with fetch(url1) as result:
+        with self.fetch(url1) as result:
             self.assertEqual(result.errors, [])
-            with httpfile.read(result.path) as (body_path, url, headers):
+            with httpfile.read(result.path) as (parameters, __, headers, body_path):
                 self.assertEqual(body_path.read_bytes(), b"A,B\n1,2")
-                self.assertEqual(url, url1)
+                self.assertEqual(parameters, {"url": url1})
         self.assertIn("/url1.csv", self.http_requestlines[0])
         self.assertIn("/url2.csv", self.http_requestlines[1])
         self.assertIn("/url3.csv", self.http_requestlines[2])
@@ -194,7 +205,7 @@ def test_redirect_loop(self):
                 MockHttpResponse(302, [("Location", url1)]),
             ]
         )
-        with fetch(url1) as result:
+        with self.fetch(url1) as result:
             self.assertEqual(result.path.read_bytes(), b"")
             self.assertEqual(
                 result.errors,
@@ -280,13 +291,13 @@ def test_render_deprecated_parquet_has_header_false(self):

     def test_render_has_header_true(self):
         with tempfile_context("http") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah",
-                    {"Content-Type": "text/csv"},
-                    io.BytesIO(b"A,B\na,b"),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "http://example.com/hello"},
+                "200 OK",
+                [("content-type", "text/csv")],
+                io.BytesIO(b"A,B\na,b"),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=True),
@@ -299,13 +310,13 @@ def test_render_has_header_true(self):

     def test_render_has_header_false(self):
         with tempfile_context("http") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah",
-                    {"Content-Type": "text/csv"},
-                    io.BytesIO(b"1,2\n3,4"),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "http://example.com/hello"},
+                "200 OK",
+                [("content-type", "text/csv")],
+                io.BytesIO(b"1,2\n3,4"),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=False),
@@ -331,16 +342,16 @@ def test_render_csv_use_url_ext_given_bad_content_type(self):
         # Use text/plain type and rely on filename detection, as
         # https://raw.githubusercontent.com/ does
         with tempfile_context(prefix="fetch-") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah/file.csv",
-                    {"Content-Type": "text/plain"},
-                    # bytes will prove we used "csv" explicitly -- we didn't
-                    # take "text/plain" and decide to use a CSV sniffer to
-                    # find the delimiter.
-                    io.BytesIO(b"A;B\na;b"),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "http://example.com/file.csv"},
+                "200 OK",
+                [("content-type", "text/plain")],
+                # bytes will prove we used "csv" explicitly -- we didn't
+                # take "text/plain" and decide to use a CSV sniffer to
+                # find the delimiter.
+                io.BytesIO(b"A;B\na;b"),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=True),
@@ -354,13 +365,13 @@ def test_render_csv_use_url_ext_given_bad_content_type(self):
     def test_render_text_plain(self):
         # guess_mime_type_or_none() treats text/plain specially.
         with tempfile_context(prefix="fetch-") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah/file.unknownext",
-                    {"Content-Type": "text/plain"},
-                    io.BytesIO(b"A;B\na;b"),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "http://example.com/file.unknownext"},
+                "200 OK",
+                [("content-type", "text/plain")],
+                io.BytesIO(b"A;B\na;b"),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=True),
@@ -377,13 +388,13 @@ def test_render_csv_handle_nonstandard_mime_type(self):
         # Sysadmins sometimes invent MIME types. We hard-code to rewrite fake
         # MIME types we've seen in the wild that seem unambiguous.
         with tempfile_context(prefix="fetch-") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah",
-                    {"Content-Type": "application/x-csv"},
-                    io.BytesIO(b"A,B\na,b"),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "http://example.com/hello"},
+                "200 OK",
+                [("content-type", "application/x-csv")],
+                io.BytesIO(b"A,B\na,b"),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=True),
@@ -396,13 +407,13 @@ def test_render_json(self):
         with tempfile_context("fetch-") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah",
-                    {"Content-Type": "application/json"},
-                    io.BytesIO(b'[{"A": "a"}]'),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "http://example.com/hello"},
+                "200 OK",
+                [("content-type", "application/json")],
+                io.BytesIO(b'[{"A": "a"}]'),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=True),
@@ -415,11 +426,13 @@ def test_render_xlsx(self):
         with tempfile_context("fetch-") as http_path:
-            with http_path.open("wb") as http_f, (TestDataPath / "example.xlsx").open(
-                "rb"
-            ) as xlsx_f:
+            with (TestDataPath / "example.xlsx").open("rb") as xlsx_f:
                 httpfile.write(
-                    http_f, "https://blah", {"Content-Type": XLSX_MIME_TYPE}, xlsx_f
+                    http_path,
+                    {"url": "http://example.com/hello"},
+                    "200 OK",
+                    [("content-type", XLSX_MIME_TYPE)],
+                    xlsx_f,
                 )
             result = render_arrow(
                 ArrowTable(),
@@ -433,13 +446,13 @@ def test_render_xlsx_bad_content(self):
         with tempfile_context("fetch-") as http_path:
-            with http_path.open("wb") as http_f:
-                httpfile.write(
-                    http_f,
-                    "https://blah",
-                    {"Content-Type": XLSX_MIME_TYPE},
-                    io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8")),
-                )
+            httpfile.write(
+                http_path,
+                {"url": "http://example.com/hello"},
+                "200 OK",
+                [("content-type", XLSX_MIME_TYPE)],
+                io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8")),
+            )
             result = render_arrow(
                 ArrowTable(),
                 P(has_header=True),