From 3c1416131505b2b05fafc2f47bd901d34aa1eba4 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer
Date: Tue, 27 Nov 2018 15:21:41 +0100
Subject: [PATCH] Minimal implementation of fetching entries of fetch.txt, #118

---
Review notes (not part of the commit; text between the three-dash line and
the first file diff is ignored by `git am`):

* Layout: line breaks, indentation and blank context lines were
  reconstructed from a whitespace-collapsed copy of this patch, using the
  hunk line counts and Python syntax. Verify against the original commit
  before applying; in particular the nesting of `if read != expected_size`
  relative to the `with open(...)` block is inferred.
* bagit.py, fetch_files_to_be_fetched: `urllib._urlopener` is assigned the
  BagFetcherURLOpener class itself, not an instance, and on Python 3 the
  `urlopen` being called is imported from `urllib.request`, not `urllib`.
  NOTE(review): presumably the custom User-Agent is therefore never applied
  on Python 3, and Python 2's `urllib.urlopen` would call `.open()` on the
  class object -- confirm on both interpreters.
* bagit.py, fetch_files_to_be_fetched: the response returned by
  `urlopen(url)` is never closed.
* bagit.py, fetch_files_to_be_fetched: the target file is written before
  the received byte count is compared with the expected size, so when the
  second BagError is raised the wrong-sized file stays on disk.
  NOTE(review): presumably a later call then finds it via `payload_files()`
  and skips it as "already fetched" -- confirm.
* bagit.py, fetch_files_to_be_fetched: `int(expected_size)` is applied to
  every entry. NOTE(review): RFC 8493 permits "-" as the length field of a
  fetch.txt line; that would surface as ValueError here rather than
  BagError -- confirm what `fetch_entries()` yields.
* bagit.py, fetch_files_to_be_fetched: parent directories of
  `join(self.path, filename)` are not created by this method.
* test.py, test_fetching_payload_file: the test downloads from github.com,
  so it needs network access to pass.

 .gitignore |  7 ++++++-
 bagit.py   | 42 ++++++++++++++++++++++++++++++++++++++++--
 test.py    | 27 +++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1f4b930..a3050ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,5 +3,10 @@ bench-data
 build
 dist
 MANIFEST
-bagit.egg-info
 .idea
+test.log
+*.egg-info
+.eggs
+*.egg
+.tox
+locale/**/*.mo
diff --git a/bagit.py b/bagit.py
index a821973..f10e124 100755
--- a/bagit.py
+++ b/bagit.py
@@ -14,6 +14,7 @@
 import signal
 import sys
 import tempfile
+import urllib
 import unicodedata
 import warnings
 from collections import defaultdict
@@ -23,9 +24,12 @@
 
 from pkg_resources import DistributionNotFound, get_distribution
 
-try:
+# pylint: disable=no-name-in-module, import-error, wrong-import-position
+if sys.version_info >= (3,):
     from urllib.parse import urlparse
-except ImportError:
+    from urllib.request import urlopen, FancyURLopener
+else:
+    from urllib import urlopen, FancyURLopener
     from urlparse import urlparse
 
 
@@ -582,6 +586,37 @@ def files_to_be_fetched(self):
         for url, file_size, filename in self.fetch_entries():
             yield filename
 
+    def fetch_files_to_be_fetched(self):
+        """
+        Fetches files from the fetch.txt
+        """
+        urllib._urlopener = BagFetcherURLOpener # pylint: disable=protected-access
+        for url, expected_size, filename in self.fetch_entries():
+            expected_size = int(expected_size) # FIXME should be int in the first place
+            if filename in self.payload_files():
+                LOGGER.info(_("File already fetched: %s"), filename)
+                continue
+            resp = urlopen(url)
+            headers = resp.info()
+            if "content-length" not in headers:
+                LOGGER.warning(_("Server sent no content-length for <%s>"), url)
+            else:
+                content_length = int(headers['content-length'])
+                if content_length != expected_size:
+                    raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))
+            with open(join(self.path, filename), 'wb') as out:
+                read = 0
+                while True:
+                    block = resp.read(1024 * 8)
+                    if not block:
+                        break
+                    read += len(block)
+                    out.write(block)
+                if read != expected_size:
+                    raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read))
+            LOGGER.info(_("Fetched %s from %s"), filename, url)
+
+
     def has_oxum(self):
         return "Payload-Oxum" in self.info
 
@@ -767,6 +802,7 @@ def validate_fetch(self):
             # well formed:
             parsed_url = urlparse(url)
 
+            # ensure url is a remote URL, not file://
             if not all((parsed_url.scheme, parsed_url.netloc)):
                 raise BagError(_("Malformed URL in fetch.txt: %s") % url)
 
@@ -937,6 +973,8 @@ def _path_is_dangerous(self, path):
         common = os.path.commonprefix((bag_path, real_path))
         return not (common == bag_path)
 
+class BagFetcherURLOpener(FancyURLopener):
+    version = "bagit.py/%s (Python/%s)" % (VERSION, sys.version)
 
 class BagError(Exception):
     pass
diff --git a/test.py b/test.py
index eab3d95..8dd21bc 100644
--- a/test.py
+++ b/test.py
@@ -1099,6 +1099,33 @@ def test_fetch_malformed_url(self):
 
         self.assertEqual(expected_msg, str(cm.exception))
 
+    # FIXME: Won't work since file:// URLs are rejected
+    # def test_fetching_payload_file(self):
+    #     with open(j(self.tmpdir, "mock_data"), "w") as mock_data:
+    #         print("Lorem ipsum dolor sit", file=mock_data)
+    #     with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
+    #         print("file://%s 21 data/mock_data" % j(self.tmpdir, "mock_data"), file=fetch_txt)
+    #     self.bag.save(manifests=True)
+    #     self.bag.validate_fetch()
+
+    def test_fetching_payload_file(self):
+        test_payload = 'loc/2478433644_2839c5e8b8_o_d.jpg'
+        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
+            print("https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % (
+                test_payload, 139367, test_payload), file=fetch_txt)
+        self.bag.save(manifests=True)
+        # should be valid
+        self.bag.validate()
+        # now delete the payload, should be invalid
+        os.unlink(j(self.tmpdir, "data", test_payload))
+        self.assertEqual(len(self.bag.compare_fetch_with_fs()), 1, '1 file to fetch')
+        with self.assertRaises(bagit.BagError):
+            self.bag.validate()
+        # re-fetch it
+        self.bag.fetch_files_to_be_fetched()
+        # should be valid again
+        self.bag.validate()
+        self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete')
 
 class TestUtils(unittest.TestCase):
     def setUp(self):