Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support bagging to a destination other than the source #92

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 169 additions & 50 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from datetime import date
from functools import partial
from os.path import abspath, isdir, isfile, join
import shutil

try:
from urllib.parse import urlparse
Expand Down Expand Up @@ -132,7 +133,9 @@ def find_locale_dir():
UNICODE_BYTE_ORDER_MARK = u'\uFEFF'


def make_bag(bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding='utf-8'):
def make_bag(bag_dir,
bag_info=None, processes=1, checksums=None, checksum=None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer not to have formatting changes in this PR or to follow PEP-8 if we do. In this case, the existing code has been using the aligned indent style.

encoding='utf-8', dest_dir=None):
"""
Convert a given directory into a bag. You can pass in arbitrary
key/value pairs to put into the bag-info.txt metadata file as
Expand All @@ -148,28 +151,57 @@ def make_bag(bag_dir, bag_info=None, processes=1, checksums=None, checksum=None,
checksums = DEFAULT_CHECKSUMS

bag_dir = os.path.abspath(bag_dir)
LOGGER.info(_("Creating bag for directory %s"), bag_dir)
if dest_dir:
LOGGER.info(_("Creating bag for directory %s"), dest_dir)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer a single logging call rather than two near-duplicate lines. In this case, I think it would be easier to follow if we set dest_dir from bag_dir when called with dest_dir=None, so that most of the code could assume dest_dir is the directory to use when assembling paths.

else:
LOGGER.info(_("Creating bag for directory %s"), bag_dir)

if not os.path.isdir(bag_dir):
LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)

# FIXME: we should do the permissions checks before changing directories
old_dir = os.path.abspath(os.path.curdir)
if dest_dir is not None:
dest_dir = os.path.abspath(dest_dir)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to normalize dest_dir earlier in the function

if os.path.exists(dest_dir):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block of code is somewhat complicated. As a minor point, OSError or EnvironmentError is probably more appropriate than RuntimeError but I'm leaning towards simplifying the entire block to:

if not os.path.isdir(dest_dir):
    os.makedirs(dest_dir)

That will raise OSError if the path already exists but has a component which is not a directory.

if not os.path.isdir(dest_dir):
# Destination directory is a file or something else we can't
# work with.
raise RuntimeError(_("Destination %s exists but is not a directory") % dest_dir)
else:
# Destination directory looks okay so far.
pass
else:
dest_par = os.path.abspath(os.path.join(dest_dir, os.pardir))
if os.path.isdir(dest_par):
try:
os.mkdir(dest_dir)
except:
raise RuntimeError(_("Could not create destination directory %s") % dest_dir)
else:
raise RuntimeError(
_("Destination parent directory %s is not a directory") % dest_par
)

try:
# TODO: These two checks are currently redundant since an unreadable directory will also
# often be unwritable, and this code will require review when we add the option to
# bag to a destination other than the source. It would be nice if we could avoid
# walking the directory tree more than once even if most filesystems will cache it

unbaggable = _can_bag(bag_dir)
if dest_dir is None:
unbaggable = _can_bag(bag_dir)
else:
unbaggable = _can_bag_to(bag_dir, dest_dir)

if unbaggable:
LOGGER.error(_("Unable to write to the following directories and files:\n%s"), unbaggable)
raise BagError(_("Missing permissions to move all files and directories"))

unreadable_dirs, unreadable_files = _can_read(bag_dir)
if dest_dir is not None:
ud, uf = _can_read(dest_dir)
unreadable_dirs += ud
unreadable_files += uf

if unreadable_dirs or unreadable_files:
if unreadable_dirs:
Expand All @@ -182,34 +214,73 @@ def make_bag(bag_dir, bag_info=None, processes=1, checksums=None, checksum=None,
else:
LOGGER.info(_("Creating data directory"))

# FIXME: if we calculate full paths we won't need to deal with changing directories
os.chdir(bag_dir)
cwd = os.getcwd()
temp_data = tempfile.mkdtemp(dir=cwd)
data_subdir = None
if dest_dir:
temp_data = tempfile.mkdtemp(dir=dest_dir)
data_dir = os.path.join(dest_dir, 'data')
data_subdir = os.path.relpath(
bag_dir, os.path.join(bag_dir, os.pardir)
)
if os.path.abspath(os.path.join(bag_dir, data_subdir)) == bag_dir:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this could be os.path.samefile(os.path.join(bag_dir, data_subdir), bag_dir)

raise RuntimeError(
_("Could not get parent of source directory %s") % bag_dir
)
else:
temp_data = tempfile.mkdtemp(dir=bag_dir)
data_dir = os.path.join(bag_dir, 'data')

for f in os.listdir('.'):
if os.path.abspath(f) == temp_data:
for f in os.listdir(bag_dir):
if os.path.join(bag_dir, f) == temp_data:
continue
new_f = os.path.join(temp_data, f)
LOGGER.info(_('Moving %(source)s to %(destination)s'),
{'source': f, 'destination': new_f})
os.rename(f, new_f)
if dest_dir:
LOGGER.info(_('Copying %(source)s to %(destination)s'),
{'source': os.path.join(bag_dir, f), 'destination': new_f})
if os.path.isdir(os.path.join(bag_dir, f)):
shutil.copytree(os.path.join(bag_dir, f), new_f)
else:
shutil.copy(os.path.join(bag_dir, f), new_f)
else:
LOGGER.info(_('Moving %(source)s to %(destination)s'),
{'source': os.path.join(bag_dir, f), 'destination': new_f})
os.rename(os.path.join(bag_dir, f), new_f)

LOGGER.info(_('Moving %(source)s to %(destination)s'),
{'source': temp_data, 'destination': 'data'})
os.rename(temp_data, 'data')
if data_subdir:
LOGGER.info(_('Moving %(source)s to %(destination)s'),
{'source': temp_data,
'destination': os.path.join(data_dir, data_subdir)})
if not os.path.exists(data_dir):
os.mkdir(data_dir)
elif os.path.exists(os.path.join(data_dir, data_subdir)):
shutil.rmtree(os.path.join(data_dir, data_subdir))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems risky at first read – shouldn't we just throw an error if the data directory already exists?

os.rename(temp_data, os.path.join(data_dir, data_subdir))
else:
LOGGER.info(_('Moving %(source)s to %(destination)s'),
{'source': temp_data, 'destination': data_dir})
os.rename(temp_data, data_dir)

# permissions for the payload directory should match those of the
# original directory
os.chmod('data', os.stat(cwd).st_mode)
os.chmod(data_dir, os.stat(bag_dir).st_mode)

if dest_dir:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering whether this should be one call with src_dir either being bag_dir or None for clarity

total_bytes, total_files = make_manifests(
data_dir, processes, algorithms=checksums, encoding=encoding, src_dir=bag_dir
)
else:
total_bytes, total_files = make_manifests(
data_dir, processes, algorithms=checksums, encoding=encoding
)

total_bytes, total_files = make_manifests('data', processes, algorithms=checksums,
encoding=encoding)

LOGGER.info(_("Creating bagit.txt"))
txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
with open_text_file('bagit.txt', 'w') as bagit_file:
bagit_file.write(txt)
if dest_dir:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This entire section is where I was thinking it might be cleaner to always have dest_dir defined since I'm not happy about so many if statements in a row

with open_text_file(os.path.join(dest_dir, 'bagit.txt'), 'w') as bagit_file:
bagit_file.write(txt)
else:
with open_text_file(os.path.join(bag_dir, 'bagit.txt'), 'w') as bagit_file:
bagit_file.write(txt)

LOGGER.info(_("Creating bag-info.txt"))
if bag_info is None:
Expand All @@ -222,17 +293,30 @@ def make_bag(bag_dir, bag_info=None, processes=1, checksums=None, checksum=None,
bag_info['Bag-Software-Agent'] = 'bagit.py v%s <%s>' % (VERSION, PROJECT_URL)

bag_info['Payload-Oxum'] = "%s.%s" % (total_bytes, total_files)
_make_tag_file('bag-info.txt', bag_info)
if dest_dir:
_make_tag_file(os.path.join(dest_dir, 'bag-info.txt'), bag_info)
else:
_make_tag_file(os.path.join(bag_dir, 'bag-info.txt'), bag_info)

for c in checksums:
_make_tagmanifest_file(c, bag_dir, encoding='utf-8')
if dest_dir:
for c in checksums:
_make_tagmanifest_file(c, dest_dir, encoding='utf-8')
else:
for c in checksums:
_make_tagmanifest_file(c, bag_dir, encoding='utf-8')
except Exception:
LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir)
if dest_dir:
LOGGER.exception(_("An error occurred creating a bag in %s"), dest_dir)
else:
LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir)
raise
finally:
os.chdir(old_dir)

return Bag(bag_dir)
if dest_dir:
bag = Bag(dest_dir)
else:
bag = Bag(bag_dir)

return bag


class Bag(object):
Expand Down Expand Up @@ -1071,11 +1155,20 @@ def _make_tag_file(bag_info_path, bag_info):
f.write("%s: %s\n" % (h, txt))


def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding='utf-8'):
def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS,
encoding='utf-8', src_dir=None):
LOGGER.info(_('Using %(process_count)d processes to generate manifests: %(algorithms)s'),
{'process_count': processes, 'algorithms': ', '.join(algorithms)})

manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms)
if src_dir:
manifest_line_generator = partial(
generate_manifest_lines, algorithms=algorithms,
src_dir=src_dir, data_dir=data_dir
)
else:
manifest_line_generator = partial(
generate_manifest_lines, algorithms=algorithms
)

if processes > 1:
pool = multiprocessing.Pool(processes=processes)
Expand All @@ -1095,12 +1188,13 @@ def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding='
# below to catch failures in the hashing process:
num_files = defaultdict(lambda: 0)
total_bytes = defaultdict(lambda: 0)


mani_dir = os.path.abspath(os.path.join(data_dir, os.pardir))
for algorithm, values in manifest_data.items():
manifest_filename = 'manifest-%s.txt' % algorithm

manifest_filename = os.path.join(mani_dir, 'manifest-%s.txt' % algorithm)
with open_text_file(manifest_filename, 'w', encoding=encoding) as manifest:
for digest, filename, byte_count in values:
filename = os.path.relpath(filename, mani_dir)
manifest.write("%s %s\n" % (digest, _encode_filename(filename)))
num_files[algorithm] += 1
total_bytes[algorithm] += byte_count
Expand Down Expand Up @@ -1144,7 +1238,7 @@ def _make_tagmanifest_file(alg, bag_dir, encoding='utf-8'):
def _find_tag_files(bag_dir):
for dir in os.listdir(bag_dir):
if dir != 'data':
if os.path.isfile(dir) and not dir.startswith('tagmanifest-'):
if os.path.isfile(os.path.join(bag_dir, dir)) and not dir.startswith('tagmanifest-'):
yield dir
for dir_name, _, filenames in os.walk(dir):
for filename in filenames:
Expand Down Expand Up @@ -1172,17 +1266,19 @@ def _walk(data_dir):

def _can_bag(test_dir):
    """Scan the provided directory for files which cannot be bagged due to insufficient permissions"""
    # In-place bagging: the source and the destination are the same
    # directory, so delegate to the generalized permission check with
    # both arguments set to test_dir.  (The previous dead local
    # `unbaggable = []` was removed — the list is built inside
    # _can_bag_to and returned from there.)
    return _can_bag_to(test_dir, test_dir)

if not os.access(test_dir, os.R_OK):
# We cannot continue without permission to read the source directory
unbaggable.append(test_dir)

def _can_bag_to(src_dir, dest_dir):
unbaggable = []
if not os.access(src_dir, os.R_OK):
unbaggable.append(src_dir)
return unbaggable

if not os.access(test_dir, os.W_OK):
unbaggable.append(test_dir)
if not os.access(dest_dir, os.W_OK):
unbaggable.append(dest_dir)

for dirpath, dirnames, filenames in os.walk(test_dir):
for dirpath, dirnames, filenames in os.walk(dest_dir):
for directory in dirnames:
full_path = os.path.join(dirpath, directory)
if not os.access(full_path, os.W_OK):
Expand Down Expand Up @@ -1213,16 +1309,29 @@ def _can_read(test_dir):
return (tuple(unreadable_dirs), tuple(unreadable_files))


def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
LOGGER.info(_("Generating manifest lines for file %s"), filename)
def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS, src_dir=None, data_dir=None):

# For performance we'll read the file only once and pass it block
# by block to every requested hash algorithm:
hashers = get_hashers(algorithms)

total_bytes = 0

with open(filename, 'rb') as f:
inpath = filename
if src_dir:
if data_dir:
src_par = os.path.abspath(os.path.join(src_dir, os.pardir))
inpath = os.path.relpath(filename, data_dir)
inpath = os.path.join(src_par, inpath)
if not os.path.exists(inpath):
inpath = filename
else:
raise RuntimeError(_("Data directory required"))
LOGGER.info(_("Generating manifest lines for file %s (source=%s)"), filename, inpath)
else:
LOGGER.info(_("Generating manifest lines for file %s"), filename)

with open(inpath, 'rb') as f:
while True:
block = f.read(HASH_BLOCK_SIZE)

Expand Down Expand Up @@ -1297,6 +1406,8 @@ def _make_parser():
help=_('Modify --validate behaviour to only test whether the bag directory'
' has the number of files and total size specified in Payload-Oxum'
' without performing checksum validation to detect corruption.'))
parser.add_argument('--destination', type=str, dest='dest_dir', default=None,
help=_('Create bag in destination directory rather than in place.'))

checksum_args = parser.add_argument_group(
_('Checksum Algorithms'),
Expand All @@ -1318,7 +1429,9 @@ def _make_parser():
parser.add_argument('directory', nargs='+',
help=_('Directory which will be converted into a bag in place'
' by moving any existing files into the BagIt structure'
' and creating the manifests and other metadata.'))
' and creating the manifests and other metadata.'
' If the destination argument is present, these directories'
' are treated as sources and left as they are.'))

return parser

Expand Down Expand Up @@ -1376,11 +1489,17 @@ def main():
try:
make_bag(bag_dir, bag_info=parser.bag_info,
processes=args.processes,
checksums=args.checksums)
checksums=args.checksums,
dest_dir=args.dest_dir)
except Exception as exc:
LOGGER.error(_("Failed to create bag in %(bag_directory)s: %(error)s"),
{'bag_directory': bag_dir, 'error': exc},
exc_info=True)
if args.dest_dir:
LOGGER.error(_("Failed to create bag in %(bag_directory)s: %(error)s"),
{'bag_directory': args.dest_dir, 'error': exc},
exc_info=True)
else:
LOGGER.error(_("Failed to create bag in %(bag_directory)s: %(error)s"),
{'bag_directory': bag_dir, 'error': exc},
exc_info=True)
rc = 1

sys.exit(rc)
Expand Down
Loading