Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: unzip support for download #209

Merged
merged 12 commits into from
Oct 31, 2023
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](https://semver.org/).

## [0.X.X] - 2023-XX-XX
## [0.0.6] - 2023-XX-XX
* New Instruments
* MAVEN mag
* MAVEN SEP
* MAVEN in situ key parameters
* REACH Dosimeter
* New Features
* Allow files to be unzipped after download
* Bug Fixes
* Fix general clean routine to skip transformation matrices
* New window needs to be integer for calculate_imf_steadiness
Expand Down
70 changes: 63 additions & 7 deletions pysatNASA/instruments/methods/cdaweb.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
import os
import pandas as pds
import requests
import tempfile
from time import sleep
import xarray as xr
import zipfile

from bs4 import BeautifulSoup
from cdasws import CdasWs
Expand Down Expand Up @@ -471,7 +473,6 @@ def load_xarray(fnames, tag='', inst_id='',
return data, meta


# TODO(#103): Include support to unzip / untar files after download.
def download(date_array, data_path, tag='', inst_id='', supported_tags=None,
remote_url='https://cdaweb.gsfc.nasa.gov'):
"""Download NASA CDAWeb data.
Expand Down Expand Up @@ -514,6 +515,7 @@ def download(date_array, data_path, tag='', inst_id='', supported_tags=None,

"""

# Get information about remote data product location
inst_dict = try_inst_dict(inst_id, tag, supported_tags)

# Naming scheme for files on the CDAWeb server
Expand All @@ -526,6 +528,13 @@ def download(date_array, data_path, tag='', inst_id='', supported_tags=None,
start=date_array[0],
stop=date_array[-1])

# Create temporary directory if files need to be unzipped.
aburrell marked this conversation as resolved.
Show resolved Hide resolved
if 'zip_method' in inst_dict.keys():
zip_method = inst_dict['zip_method']
temp_dir = tempfile.TemporaryDirectory()
else:
zip_method = None

# Download only requested files that exist remotely
for date, fname in remote_files.items():
# Format files for specific dates and download location
Expand All @@ -548,18 +557,19 @@ def download(date_array, data_path, tag='', inst_id='', supported_tags=None,
formatted_remote_dir.strip('/'),
fname))

saved_local_fname = os.path.join(data_path, fname)

# Perform download
logger.info(' '.join(('Attempting to download file for',
date.strftime('%d %B %Y'))))
try:
with requests.get(remote_path) as req:
if req.status_code != 404:
with open(saved_local_fname, 'wb') as open_f:
open_f.write(req.content)
logger.info('Successfully downloaded {:}.'.format(
saved_local_fname))
if zip_method:
get_file(req.content, data_path, fname,
temp_path=temp_dir.name, zip_method=zip_method)
else:
get_file(req.content, data_path, fname)
logger.info(''.join(('Successfully downloaded ',
fname, '.')))
else:
logger.info(' '.join(('File not available for',
date.strftime('%d %B %Y'))))
Expand All @@ -568,6 +578,52 @@ def download(date_array, data_path, tag='', inst_id='', supported_tags=None,
date.strftime('%d %B %Y'))))
# Pause to avoid excessive pings to server
sleep(0.2)

if zip_method:
# Cleanup temporary directory
temp_dir.cleanup()

return


def get_file(remote_file, data_path, fname, temp_path=None, zip_method=None):
    """Retrieve a file, unzipping if necessary.

    Parameters
    ----------
    remote_file : bytes
        File content retrieved via requests.
    data_path : str
        Path to pysat archival directory.
    fname : str
        Name of file on the remote server.
    temp_path : str or NoneType
        Path to a temporary directory used to stage a zipped download before
        it is extracted. Only used if `zip_method` is set. If None while
        `zip_method` is set, a temporary directory is created for this call
        and removed once the call finishes. (default=None)
    zip_method : str or NoneType
        The method used to zip the file. Supports 'zip' and None.
        If None, downloads files directly. (default=None)

    Note
    ----
    If `zip_method` is not recognized, the content is still written to the
    temporary location and a warning is logged; nothing is written to
    `data_path`.

    """

    if zip_method and temp_path is None:
        # A staging area is required but none was supplied. Joining a path
        # onto None would raise a TypeError, so stage the download in a
        # directory that is cleaned up as soon as the work is done.
        with tempfile.TemporaryDirectory() as scratch_dir:
            get_file(remote_file, data_path, fname, temp_path=scratch_dir,
                     zip_method=zip_method)
        return

    if zip_method:
        # Use a temporary location.
        dl_fname = os.path.join(temp_path, fname)
    else:
        # Use the pysat data directory.
        dl_fname = os.path.join(data_path, fname)

    # Write the downloaded content to the desired destination.
    with open(dl_fname, 'wb') as open_f:
        open_f.write(remote_file)

    # Unzip the staged archive into the pysat data directory.
    if zip_method == 'zip':
        with zipfile.ZipFile(dl_fname, 'r') as open_zip:
            open_zip.extractall(data_path)

    elif zip_method is not None:
        logger.warning('{:} is not a recognized zip method'.format(zip_method))

    return


Expand Down
23 changes: 23 additions & 0 deletions pysatNASA/tests/test_methods_cdaweb.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Unit tests for the cdaweb instrument methods."""

import datetime as dt
import logging
import pandas as pds
import requests
import tempfile

import pytest

Expand Down Expand Up @@ -50,6 +52,27 @@ def test_load_with_empty_file_list(self):
assert meta is None
return

def test_bad_zip_warning_get_files(self, caplog):
    """Test that warning is raised for unsupported zip method."""

    # Specify small file to get
    url = 'https://cdaweb.gsfc.nasa.gov/pub/000_readme.txt'
    req = requests.get(url)

    # Stage the download in a temporary location. The context manager
    # guarantees the directory is removed even if the call below fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        with caplog.at_level(logging.WARNING, logger='pysat'):
            cdw.get_file(req.content, '.', 'test.txt', temp_path=temp_dir,
                         zip_method='badzip')
        captured = caplog.text

    # Check for appropriate warning
    warn_msg = "not a recognized zip method"
    assert warn_msg in captured

    return

@pytest.mark.parametrize("bad_key,bad_val,err_msg",
[("tag", "badval", "inst_id / tag combo unknown."),
("inst_id", "badval",
Expand Down
Loading