Skip to content

Commit

Permalink
Minimal implementation of fetching entries of fetch.txt, LibraryOfCon…
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Dec 10, 2018
1 parent fb6c7b1 commit dae7b40
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 3 deletions.
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,10 @@ bench-data
build
dist
MANIFEST
bagit.egg-info
.idea
test.log
*.egg-info
.eggs
*.egg
.tox
locale/**/*.mo
42 changes: 40 additions & 2 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import signal
import sys
import tempfile
import urllib
import unicodedata
import warnings
from collections import defaultdict
Expand All @@ -23,9 +24,12 @@

from pkg_resources import DistributionNotFound, get_distribution

try:
# pylint: disable=no-name-in-module, import-error, wrong-import-position
if sys.version_info >= (3,):
from urllib.parse import urlparse
except ImportError:
from urllib.request import urlopen, FancyURLopener
else:
from urllib import urlopen, FancyURLopener
from urlparse import urlparse


Expand Down Expand Up @@ -582,6 +586,37 @@ def files_to_be_fetched(self):
for url, file_size, filename in self.fetch_entries():
yield filename

def fetch_files_to_be_fetched(self):
    """
    Download every fetch.txt entry that is not yet present in the payload.

    Each entry yielded by ``self.fetch_entries()`` is a ``(url, size,
    filename)`` triple. ``size`` is either a decimal octet count or ``-``
    when the length is unknown; in the latter case no size check is made.

    Raises:
        BagError: if the server answers with an HTTP error status, or if
            the announced Content-Length or the number of bytes actually
            received differs from the size recorded in fetch.txt.
        ValueError: if the size field is neither ``-`` nor an integer.
    """
    # Use a private opener *instance* instead of assigning the class to the
    # global ``urllib._urlopener`` hook: the hook needs an instance, is
    # ignored by Python 3's urlopen, and would leak into unrelated code.
    opener = BagFetcherURLOpener()
    # Header values must be single-line; collapse any whitespace run
    # (including a line break inherited from the version string).
    opener.addheaders = [
        (name, " ".join(value.split())) for name, value in opener.addheaders
    ]

    # Evaluate payload_files() once rather than once per fetch.txt entry.
    already_present = set(self.payload_files())

    for url, expected_size, filename in self.fetch_entries():
        # "-" is the fetch.txt marker for "length not specified".
        expected_size = None if expected_size == "-" else int(expected_size)

        if filename in already_present:
            LOGGER.info(_("File already fetched: %s"), filename)
            continue

        resp = opener.open(url)
        try:
            # The opener hands back error pages as ordinary responses, so
            # the status has to be checked explicitly. getcode() is None
            # for non-HTTP schemes.
            status = resp.getcode()
            if status is not None and status >= 400:
                raise BagError(
                    _("Fetching <%s> failed with HTTP status %s") % (url, status)
                )

            headers = resp.info()
            if "content-length" not in headers:
                LOGGER.warning(_("Server sent no content-length for <%s>"), url)
            elif expected_size is not None:
                content_length = int(headers["content-length"])
                if content_length != expected_size:
                    raise BagError(
                        _("Inconsistent size of %s: Expected %s but Content-Length is %s")
                        % (filename, expected_size, content_length)
                    )

            target = join(self.path, filename)
            parent = os.path.dirname(target)
            if parent and not os.path.isdir(parent):
                os.makedirs(parent)

            received = 0
            try:
                with open(target, "wb") as out:
                    while True:
                        block = resp.read(1024 * 8)
                        if not block:
                            break
                        received += len(block)
                        out.write(block)
                if expected_size is not None and received != expected_size:
                    raise BagError(
                        _("Inconsistent size of %s: Expected %s but received %s")
                        % (filename, expected_size, received)
                    )
            except Exception:
                # Never leave a truncated download behind: it would be
                # treated as "already fetched" on the next attempt.
                if os.path.exists(target):
                    os.remove(target)
                raise
        finally:
            resp.close()

        # A duplicate fetch.txt entry must not be downloaded twice.
        already_present.add(filename)
        LOGGER.info(_("Fetched %s from %s"), filename, url)


def has_oxum(self):
    """Return True when ``self.info`` contains a ``Payload-Oxum`` key."""
    bag_info = self.info
    return "Payload-Oxum" in bag_info

Expand Down Expand Up @@ -767,6 +802,7 @@ def validate_fetch(self):
# well formed:
parsed_url = urlparse(url)

# ensure url is a remote URL, not file://
if not all((parsed_url.scheme, parsed_url.netloc)):
raise BagError(_("Malformed URL in fetch.txt: %s") % url)

Expand Down Expand Up @@ -937,6 +973,8 @@ def _path_is_dangerous(self, path):
common = os.path.commonprefix((bag_path, real_path))
return not (common == bag_path)

class BagFetcherURLOpener(FancyURLopener):
    """URL opener that announces bagit.py as the user agent."""

    # Only the bare interpreter version number is embedded: the complete
    # sys.version string may contain a line break, which is not a legal
    # HTTP header value.
    version = "bagit.py/%s (Python/%s)" % (VERSION, sys.version.split()[0])

class BagError(Exception):
    """Base error raised for problems detected while handling a bag."""
Expand Down
27 changes: 27 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,33 @@ def test_fetch_malformed_url(self):

self.assertEqual(expected_msg, str(cm.exception))

# FIXME: Won't work since file:// URLs are rejected
# def test_fetching_payload_file(self):
# with open(j(self.tmpdir, "mock_data"), "w") as mock_data:
# print("Lorem ipsum dolor sit", file=mock_data)
# with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
# print("file://%s 21 data/mock_data" % j(self.tmpdir, "mock_data"), file=fetch_txt)
# self.bag.save(manifests=True)
# self.bag.validate_fetch()

def test_fetching_payload_file(self):
    """Delete a payload file listed in fetch.txt, re-fetch it, and re-validate."""
    payload_rel = 'loc/2478433644_2839c5e8b8_o_d.jpg'
    fetch_line = "https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % (
        payload_rel, 139367, payload_rel)
    with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
        print(fetch_line, file=fetch_txt)

    bag = self.bag
    bag.save(manifests=True)

    # With the payload still on disk the bag validates.
    bag.validate()

    # Removing the payload leaves exactly one entry to fetch and breaks validation.
    payload_on_disk = j(self.tmpdir, "data", payload_rel)
    os.unlink(payload_on_disk)
    self.assertEqual(len(bag.compare_fetch_with_fs()), 1, '1 file to fetch')
    self.assertRaises(bagit.BagError, bag.validate)

    # Downloading the missing file restores a valid, complete bag.
    bag.fetch_files_to_be_fetched()
    bag.validate()
    self.assertEqual(len(bag.compare_fetch_with_fs()), 0, 'complete')

class TestUtils(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit dae7b40

Please sign in to comment.