From 75fa4244017c78c4e3b2dbdbe2100545858ed092 Mon Sep 17 00:00:00 2001 From: Alex Morling Date: Thu, 12 Sep 2024 23:07:41 +0200 Subject: [PATCH 1/7] add retries and default unzip to False --- ecoscope/io/utils.py | 10 ++++++++-- tests/test_io_utils.py | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/ecoscope/io/utils.py b/ecoscope/io/utils.py index e60c530f..1d98ef49 100644 --- a/ecoscope/io/utils.py +++ b/ecoscope/io/utils.py @@ -6,7 +6,9 @@ import pandas as pd import requests +from requests.adapters import HTTPAdapter from tqdm.auto import tqdm +from urllib3.util import Retry def to_hex(val, default="#ff0000"): @@ -27,18 +29,22 @@ def pack_columns(dataframe: pd.DataFrame, columns: typing.List): return dataframe -def download_file(url, path, overwrite_existing=False, chunk_size=1024, unzip=True, **request_kwargs): +def download_file(url, path, retries=2, overwrite_existing=False, chunk_size=1024, unzip=False, **request_kwargs): """ Download a file from a URL to a local path. If the path is a directory, the filename will be inferred from the response header """ + s = requests.Session() + retries = Retry(total=retries, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]) + s.mount("https://", HTTPAdapter(max_retries=retries)) + if __is_gdrive_url(url): url = __transform_gdrive_url(url) elif __is_dropbox_url(url): url = __transform_dropbox_url(url) - r = requests.get(url, stream=True, **request_kwargs) + r = s.get(url, stream=True, **request_kwargs) if os.path.isdir(path): m = email.message.Message() diff --git a/tests/test_io_utils.py b/tests/test_io_utils.py index 2ca0123b..a84aaad3 100644 --- a/tests/test_io_utils.py +++ b/tests/test_io_utils.py @@ -1,8 +1,12 @@ import json import os +import pytest import fsspec import pandas as pd +from unittest.mock import Mock, patch +from http.client import HTTPMessage +from requests.exceptions import RetryError import ecoscope @@ -80,3 +84,24 @@ def test_download_file_dropbox_share_link(): data = pd.read_csv(os.path.join(output_dir, "download_data.csv")) assert len(data) > 0 + + +@patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") +def test_download_file_retry_on_error(mock): + mock.return_value.getresponse.side_effect = [ + Mock(status=500, msg=HTTPMessage(), headers={}), + Mock(status=504, msg=HTTPMessage(), headers={}), + Mock(status=503, msg=HTTPMessage(), headers={}), + ] + + url = "https://totallyreal.com" + output_dir = "tests/test_output" + + with pytest.raises(RetryError): + ecoscope.io.download_file( + url, + os.path.join(output_dir, "a_file.csv"), + overwrite_existing=True, + ) + + assert mock.call_count == 3 From 5f60c5011d247052ad35829a20d88339a5a092a1 Mon Sep 17 00:00:00 2001 From: Alex Morling Date: Thu, 12 Sep 2024 23:31:27 +0200 Subject: [PATCH 2/7] use get for headers + skip tqdm when content-length is missing --- ecoscope/io/utils.py | 5 +++-- tests/test_io_utils.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ecoscope/io/utils.py b/ecoscope/io/utils.py index 1d98ef49..8c5609bc 100644 --- a/ecoscope/io/utils.py +++ b/ecoscope/io/utils.py @@ -48,7 +48,7 @@ def download_file(url, path, retries=2, overwrite_existing=False, chunk_size=102 if os.path.isdir(path): m = email.message.Message() - m["content-type"] = r.headers["content-disposition"] + m["content-type"] = r.headers.get("content-disposition") filename = m.get_param("filename") if filename is None: raise ValueError("URL has no RFC 6266 filename.") @@ -59,7 +59,8 @@ def download_file(url, path, 
retries=2, overwrite_existing=False, chunk_size=102 return with open(path, "wb") as f: - with tqdm.wrapattr(f, "write", total=int(r.headers["Content-Length"])) as fout: + content_length = r.headers.get("content-length") + with tqdm.wrapattr(f, "write", total=int(content_length)) if content_length else f as fout: for chunk in r.iter_content(chunk_size=chunk_size): fout.write(chunk) diff --git a/tests/test_io_utils.py b/tests/test_io_utils.py index a84aaad3..8cd739b8 100644 --- a/tests/test_io_utils.py +++ b/tests/test_io_utils.py @@ -100,7 +100,7 @@ def test_download_file_retry_on_error(mock): with pytest.raises(RetryError): ecoscope.io.download_file( url, - os.path.join(output_dir, "a_file.csv"), + output_dir, overwrite_existing=True, ) From 0c260be72546e1a57ec718ae9bea4cb354b53aab Mon Sep 17 00:00:00 2001 From: Alex Morling Date: Fri, 13 Sep 2024 13:53:43 +0200 Subject: [PATCH 3/7] some notebook updates --- doc/source/notebooks/01. IO/GEE_IO.ipynb | 20 +------------------ .../01. IO/Landscape Dynamics Data.ipynb | 18 +---------------- .../04. EcoMap & EcoPlot/EcoPlot.ipynb | 1 - nb-tests/test_notebooks.py | 6 ++---- requirements-notebooks-test.txt | 3 ++- 5 files changed, 6 insertions(+), 42 deletions(-) diff --git a/doc/source/notebooks/01. IO/GEE_IO.ipynb b/doc/source/notebooks/01. IO/GEE_IO.ipynb index 34f591d5..236c44cb 100644 --- a/doc/source/notebooks/01. IO/GEE_IO.ipynb +++ b/doc/source/notebooks/01. IO/GEE_IO.ipynb @@ -38,7 +38,6 @@ "source": [ "import os\n", "import sys\n", - "import zipfile\n", "\n", "import shapely\n", "\n", @@ -208,26 +207,9 @@ "ecoscope.io.utils.download_file(\n", " url=img.getDownloadUrl(download_config),\n", " path=img_zip_file,\n", + " unzip=True,\n", ")" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Unzip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with zipfile.ZipFile(img_zip_file) as z:\n", - " for name in z.namelist():\n", - " z.extract(name, output_dir)" - ] } ], "metadata": { diff --git a/doc/source/notebooks/01. IO/Landscape Dynamics Data.ipynb b/doc/source/notebooks/01. IO/Landscape Dynamics Data.ipynb index c099ae63..a8e72883 100644 --- a/doc/source/notebooks/01. IO/Landscape Dynamics Data.ipynb +++ b/doc/source/notebooks/01. IO/Landscape Dynamics Data.ipynb @@ -40,7 +40,6 @@ "source": [ "import os\n", "import sys\n", - "import zipfile\n", "\n", "import geopandas as gpd\n", "\n", @@ -90,25 +89,10 @@ " url=\"https://maraelephant.maps.arcgis.com/sharing/rest/content/items/162e299f0c7d472b8e36211e946bb273/data\",\n", " path=output_dir,\n", " overwrite_existing=False,\n", + " unzip=True,\n", ")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Extract ZIP" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "zipfile.ZipFile(os.path.join(output_dir, \"active_public_uncategorized_shpfiles.zip\")).extractall(path=output_dir)" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/doc/source/notebooks/04. EcoMap & EcoPlot/EcoPlot.ipynb b/doc/source/notebooks/04. EcoMap & EcoPlot/EcoPlot.ipynb index bf930a7b..81772c63 100644 --- a/doc/source/notebooks/04. EcoMap & EcoPlot/EcoPlot.ipynb +++ b/doc/source/notebooks/04. 
EcoMap & EcoPlot/EcoPlot.ipynb @@ -98,7 +98,6 @@ "ecoscope.io.download_file(\n", " f\"{ECOSCOPE_RAW}/tests/sample_data/vector/er_relocs.csv.zip\",\n", " os.path.join(output_dir, \"er_relocs.csv.zip\"),\n", - " unzip=False,\n", ")\n", "\n", "data = pd.read_csv(os.path.join(output_dir, \"er_relocs.csv.zip\"), header=0, index_col=0)\n", diff --git a/nb-tests/test_notebooks.py b/nb-tests/test_notebooks.py index df47e535..3646a22b 100644 --- a/nb-tests/test_notebooks.py +++ b/nb-tests/test_notebooks.py @@ -17,13 +17,11 @@ NB_DIR = pathlib.Path(__file__).parent.parent / "doc" / "source" / "notebooks" KNOWN_ERRORS_REGEXES = { # This is basically a GitHub ticket queue - "EarthRanger_IO.ipynb": "Series found", + # "EarthRanger_IO.ipynb": "Series found", "Relocations_and_Trajectories.ipynb": "No module named 'branca'", - "EcoGraph.ipynb": "not a zip file", - "EcoPlot.ipynb": "not a zip file", + # "EcoGraph.ipynb": "not a zip file", "Landscape Grid.ipynb": "No module named 'branca'", "Seasonal Calculation.ipynb": "No module named 'branca'", - "Tracking Data Gantt Chart.ipynb": "Bad CRC-32 for file 'er_relocs.csv.zip'", "Remote Sensing Time Series Anomaly.ipynb": "No module named 'branca'", "Reduce Regions.ipynb": "No module named 'branca'", "Landscape Dynamics Data.ipynb": "No module named 'branca'", diff --git a/requirements-notebooks-test.txt b/requirements-notebooks-test.txt index eb03be4e..b3f8c43e 100644 --- a/requirements-notebooks-test.txt +++ b/requirements-notebooks-test.txt @@ -3,4 +3,5 @@ pytest papermill .[all] ipykernel -pytest-xdist \ No newline at end of file +pytest-xdist +folium \ No newline at end of file From b6e434c67d2b9a9513c5dff31dfb6a34a6f80057 Mon Sep 17 00:00:00 2001 From: Alex Morling Date: Fri, 13 Sep 2024 21:26:37 +0200 Subject: [PATCH 4/7] date range some er tests --- doc/source/notebooks/01. IO/EarthRanger_IO.ipynb | 6 +++++- nb-tests/test_notebooks.py | 2 -- tests/test_asyncearthranger_io.py | 11 +++++++++-- tests/test_earthranger_io.py | 11 +++++++++-- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb b/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb index 8b13905e..63b26a09 100644 --- a/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb +++ b/doc/source/notebooks/01. 
IO/EarthRanger_IO.ipynb @@ -614,7 +614,11 @@ "metadata": {}, "outputs": [], "source": [ - "patrol_df = er_io.get_patrols()\n", + "patrol_df = er_io.get_patrols(\n", + " since=pd.Timestamp(\"2017-01-01\").isoformat(),\n", + " until=pd.Timestamp(\"2017-04-01\").isoformat(),\n", + ")\n", + "\n", "\n", "relocs = er_io.get_patrol_observations(\n", " patrol_df,\n", diff --git a/nb-tests/test_notebooks.py b/nb-tests/test_notebooks.py index 3646a22b..728b8896 100644 --- a/nb-tests/test_notebooks.py +++ b/nb-tests/test_notebooks.py @@ -17,9 +17,7 @@ NB_DIR = pathlib.Path(__file__).parent.parent / "doc" / "source" / "notebooks" KNOWN_ERRORS_REGEXES = { # This is basically a GitHub ticket queue - # "EarthRanger_IO.ipynb": "Series found", "Relocations_and_Trajectories.ipynb": "No module named 'branca'", - # "EcoGraph.ipynb": "not a zip file", "Landscape Grid.ipynb": "No module named 'branca'", "Seasonal Calculation.ipynb": "No module named 'branca'", "Remote Sensing Time Series Anomaly.ipynb": "No module named 'branca'", diff --git a/tests/test_asyncearthranger_io.py b/tests/test_asyncearthranger_io.py index 4384846f..8eafb57d 100644 --- a/tests/test_asyncearthranger_io.py +++ b/tests/test_asyncearthranger_io.py @@ -218,7 +218,10 @@ async def test_get_patrols(er_io_async, get_patrols_fields): @pytest.mark.asyncio async def test_get_patrol_observations(er_io_async, get_patrol_observations_fields): - observations = await er_io_async.get_patrol_observations_with_patrol_filter() + observations = await er_io_async.get_patrol_observations_with_patrol_filter( + since=pd.Timestamp("2017-01-01").isoformat(), + until=pd.Timestamp("2017-04-01").isoformat(), + ) assert not observations.empty assert set(observations.columns) == set(get_patrol_observations_fields) assert type(observations["fixtime"] == pd.Timestamp) @@ -228,7 +231,11 @@ async def test_get_patrol_observations(er_io_async, get_patrol_observations_fiel async def test_get_patrol_observations_with_patrol_details( er_io_async, get_patrol_observations_fields, get_patrol_details_fields ): - observations = await er_io_async.get_patrol_observations_with_patrol_filter(include_patrol_details=True) + observations = await er_io_async.get_patrol_observations_with_patrol_filter( + since=pd.Timestamp("2017-01-01").isoformat(), + until=pd.Timestamp("2017-04-01").isoformat(), + include_patrol_details=True, + ) assert not observations.empty assert set(observations.columns) == set(get_patrol_observations_fields).union(get_patrol_details_fields) assert type(observations["fixtime"] == pd.Timestamp) diff --git a/tests/test_earthranger_io.py b/tests/test_earthranger_io.py index 8336fd3b..5a374c78 100644 --- a/tests/test_earthranger_io.py +++ b/tests/test_earthranger_io.py @@ -87,7 +87,10 @@ def test_get_patrols(er_io): def test_get_patrol_events(er_io): - events = er_io.get_patrol_events() + events = er_io.get_patrol_events( + since=pd.Timestamp("2017-01-01").isoformat(), + until=pd.Timestamp("2017-04-01").isoformat(), + ) assert "id" in events assert "event_type" in events assert "geometry" in events @@ -196,7 +199,11 @@ def test_patch_event(er_io): def test_get_patrol_observations(er_io): - patrols = er_io.get_patrols() + patrols = er_io.get_patrols( + since=pd.Timestamp("2017-01-01").isoformat(), + until=pd.Timestamp("2017-04-01").isoformat(), + ) + observations = er_io.get_patrol_observations( patrols, include_source_details=False, From 83a865b5f73d5554979aeab0afbc32563ffc9584 Mon Sep 17 00:00:00 2001 From: Alex Morling Date: Fri, 13 Sep 2024 23:10:40 +0200 Subject: 
[PATCH 5/7] cleanup --- nb-tests/test_notebooks.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nb-tests/test_notebooks.py b/nb-tests/test_notebooks.py index 728b8896..0e05f485 100644 --- a/nb-tests/test_notebooks.py +++ b/nb-tests/test_notebooks.py @@ -18,11 +18,6 @@ KNOWN_ERRORS_REGEXES = { # This is basically a GitHub ticket queue "Relocations_and_Trajectories.ipynb": "No module named 'branca'", - "Landscape Grid.ipynb": "No module named 'branca'", - "Seasonal Calculation.ipynb": "No module named 'branca'", - "Remote Sensing Time Series Anomaly.ipynb": "No module named 'branca'", - "Reduce Regions.ipynb": "No module named 'branca'", - "Landscape Dynamics Data.ipynb": "No module named 'branca'", } From a018a43381fb961e140a3bd76fa8b1355618c488 Mon Sep 17 00:00:00 2001 From: Alex Morling Date: Sat, 21 Sep 2024 09:34:38 +0200 Subject: [PATCH 6/7] some fixes for notebook failures --- doc/source/notebooks/01. IO/EarthRanger_IO.ipynb | 2 +- .../Relocations_and_Trajectories.ipynb | 2 +- doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb | 2 +- ecoscope/analysis/ecograph.py | 2 +- ecoscope/io/earthranger.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb b/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb index 63b26a09..520459dc 100644 --- a/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb +++ b/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb @@ -714,7 +714,7 @@ "\n", "if not elephants.empty:\n", " for i, value in elephants.iterrows():\n", - " er_io.delete_observation(observation_id=elephants.loc[i, \"extra__id\"])" + " er_io.delete_observation(observation_id=value[\"extra__id\"])" ] }, { diff --git a/doc/source/notebooks/02. Relocations & Trajectories/Relocations_and_Trajectories.ipynb b/doc/source/notebooks/02. Relocations & Trajectories/Relocations_and_Trajectories.ipynb index 3504e798..0d28c290 100644 --- a/doc/source/notebooks/02. Relocations & Trajectories/Relocations_and_Trajectories.ipynb +++ b/doc/source/notebooks/02. Relocations & Trajectories/Relocations_and_Trajectories.ipynb @@ -252,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "relocs[[\"groupby_col\", \"fixtime\", \"geometry\"]].explore()" + "relocs[[\"groupby_col\", \"geometry\"]].explore()" ] }, { diff --git a/doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb b/doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb index d9fe061f..aa8b0466 100644 --- a/doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb +++ b/doc/source/notebooks/03. 
Home Range & Movescape/EcoGraph.ipynb @@ -546,7 +546,7 @@ " individual=\"1d22ff96-44d4-45c4-adc3-db1513acbe7d\",\n", " interpolation=\"dofjojfs\",\n", " )\n", - "except NotImplemented as e:\n", + "except NotImplementedError as e:\n", " print(e)" ] }, diff --git a/ecoscope/analysis/ecograph.py b/ecoscope/analysis/ecograph.py index ac94f10f..6eb3de0f 100644 --- a/ecoscope/analysis/ecograph.py +++ b/ecoscope/analysis/ecograph.py @@ -85,7 +85,7 @@ def compute(df): G = self._get_ecograph(df, subject_name, radius, cutoff, tortuosity_length) self.graphs[subject_name] = G - self.trajectory.groupby("groupby_col")[self.trajectory.columns].progress_apply(compute) + self.trajectory.groupby("groupby_col")[self.trajectory.columns].apply(compute) def to_csv(self, output_path): """ diff --git a/ecoscope/io/earthranger.py b/ecoscope/io/earthranger.py index b71a83fc..bd0791d7 100644 --- a/ecoscope/io/earthranger.py +++ b/ecoscope/io/earthranger.py @@ -1041,7 +1041,7 @@ def upload(obs): else: return pd.DataFrame(results) - return observations.groupby(source_id_col, group_keys=False).progress_apply(upload) + return observations.groupby(source_id_col, group_keys=False).apply(upload) def post_event( self, From d1fa129f9dd4d08d58c410577a002719c4aed0a6 Mon Sep 17 00:00:00 2001 From: Alex Morling Date: Sat, 21 Sep 2024 11:46:45 +0200 Subject: [PATCH 7/7] more notebook fixes --- doc/source/notebooks/01. IO/EarthRanger_IO.ipynb | 10 +++------- .../03. Home Range & Movescape/EcoGraph.ipynb | 2 +- nb-tests/test_notebooks.py | 4 +--- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb b/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb index 520459dc..3ad15f6e 100644 --- a/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb +++ b/doc/source/notebooks/01. IO/EarthRanger_IO.ipynb @@ -713,8 +713,8 @@ ")\n", "\n", "if not elephants.empty:\n", - " for i, value in elephants.iterrows():\n", - " er_io.delete_observation(observation_id=value[\"extra__id\"])" + " for observation_id in elephants[\"extra__id\"].unique():\n", + " er_io.delete_observation(observation_id)" ] }, { @@ -867,11 +867,7 @@ "metadata": {}, "outputs": [], "source": [ - "relocs.drop(\n", - " columns=relocs.columns[relocs.applymap(lambda x: isinstance(x, list)).any()],\n", - " errors=\"ignore\",\n", - " inplace=True,\n", - ")\n", + "relocs = relocs.select_dtypes(exclude=[list])\n", "\n", "relocs.to_file(os.path.join(output_dir, \"observations.gpkg\"), layer=\"observations\")" ] diff --git a/doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb b/doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb index aa8b0466..d3d45642 100644 --- a/doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb +++ b/doc/source/notebooks/03. Home Range & Movescape/EcoGraph.ipynb @@ -152,7 +152,7 @@ "metadata": {}, "outputs": [], "source": [ - "traj.explore()" + "traj[\"geometry\"].explore()" ] }, { diff --git a/nb-tests/test_notebooks.py b/nb-tests/test_notebooks.py index 0e05f485..a26af66d 100644 --- a/nb-tests/test_notebooks.py +++ b/nb-tests/test_notebooks.py @@ -16,9 +16,7 @@ NB_DIR = pathlib.Path(__file__).parent.parent / "doc" / "source" / "notebooks" -KNOWN_ERRORS_REGEXES = { # This is basically a GitHub ticket queue - "Relocations_and_Trajectories.ipynb": "No module named 'branca'", -} +KNOWN_ERRORS_REGEXES = {} # This is basically a GitHub ticket queue @dataclass
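
Usage sketch (not part of the patch series; the URL and output directory below are placeholders, not values taken from the patches): after these changes, ecoscope.io.download_file retries transient 5xx responses through a urllib3 Retry policy mounted on a requests HTTPAdapter, no longer unzips by default, and surfaces an exhausted retry budget as requests.exceptions.RetryError. With the default retries=2 that means three attempts in total, which is what the new test's mock.call_count == 3 assertion checks.

import os

import requests

import ecoscope

output_dir = "downloads"  # placeholder directory; the filename is inferred from content-disposition
os.makedirs(output_dir, exist_ok=True)

try:
    ecoscope.io.download_file(
        "https://example.com/archive.zip",  # placeholder URL
        output_dir,
        retries=3,                # total urllib3 retries on 500/502/503/504 responses
        overwrite_existing=True,
        unzip=True,               # unzip now defaults to False, so opt in explicitly
    )
except requests.exceptions.RetryError:
    # Raised by requests once the Retry policy gives up on repeated 5xx responses.
    print("download failed after retries")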