From 153fb61136183b459ee3f088a5dff50759793a98 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Wed, 13 Sep 2023 08:22:40 +0100
Subject: [PATCH 1/4] Add index generation system that uses offsets into the WACZ itself.

---
 wacz/main.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/wacz/main.py b/wacz/main.py
index 2704086..063f9a3 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -1,11 +1,12 @@
 from argparse import ArgumentParser, RawTextHelpFormatter
 from io import BytesIO, StringIO, TextIOWrapper
-import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources
+import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources, zlib
 from wacz.waczindexer import WACZIndexer
 from wacz.util import now, WACZ_VERSION, construct_passed_pages_dict
 from wacz.validate import Validation, OUTDATED_WACZ
 from wacz.util import validateJSON, get_py_wacz_version
 from warcio.timeutils import iso_date_to_timestamp
+from warcio.bufferedreaders import DecompressingBufferedReader

 """
 WACZ Generator
@@ -103,6 +104,10 @@ def main(args=None):
         help="URL of verify server to verify the signature, if any, in datapackage-digest.json",
     )

+    index = subparsers.add_parser("index", help="generate a WACZ-level CDXJ index")
+    index.add_argument("-f", "--file", required=True)
+    index.set_defaults(func=index_wacz)
+
     cmd = parser.parse_args(args=args)

     if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None:
@@ -154,6 +159,51 @@ def validate_wacz(res):
     return 0


+def index_wacz(res):
+
+    # Open up the ZIP:
+    with zipfile.ZipFile(res.file) as zf:
+
+        # Determine the WACZ filename/path to use:
+        wacz_path = os.path.basename(res.file)
+        # TODO Add option to override prefix
+
+        # Get a look-up table for offsets for each archive file:
+        archive_offsets = {}
+        archives_prefix = "archive/"
+        for zinfo in zf.infolist():
+            #print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
+            if zinfo.filename.startswith(archives_prefix):
+                archive_name = zinfo.filename[len(archives_prefix):]
+                archive_offsets[archive_name] = zinfo.header_offset + len(zinfo.FileHeader())
+                if zinfo.compress_type != 0:
+                    raise Exception("Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!")
+
+        # Stream through the index in the WACZ:
+        index_file = "indexes/index.cdx.gz"
+        zinfo = zf.getinfo(index_file)
+        print(f'{zinfo.filename}: offset {zinfo.header_offset}')
+        with zf.open(index_file) as f:
+            reader = DecompressingBufferedReader(f)
+            while True:
+                line = reader.readline()
+                # If we reach the end, stop:
+                if len(line) == 0:
+                    break
+                # Otherwise, decode the line:
+                surt, timestamp, json_data_str = line.decode('utf-8').rstrip('\n').split(' ', maxsplit=2)
+                json_data = json.loads(json_data_str)
+                # Override the offset to include the file offset in the ZIP
+                archive_filename = json_data['filename']
+                archive_offset = json_data['offset']
+                json_data['offset'] = archive_offsets[archive_filename] + int(archive_offset)
+                #json_data['original_offset'] = archive_offset
+                # Also override the filename to point at the WACZ
+                json_data['filename'] = wacz_path
+                #json_data['original_filename'] = archive_filename
+                # Output the modified values:
+                print(f"{surt} {timestamp} {json.dumps(json_data)}")
+
 def create_wacz(res):
     wacz = zipfile.ZipFile(res.output, "w")
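
The offset arithmetic above relies on the WARC members being stored in the ZIP without compression: zinfo.header_offset is where the member's local file header starts, and len(zinfo.FileHeader()) is the length of that header, so their sum is the first byte of the WARC data within the WACZ. A record offset taken from the CDXJ can then simply be shifted by that amount. A minimal standard-library sketch of that assumption, using hypothetical file and member names (and assuming the rebuilt local header matches the one actually on disk):

    import zipfile

    # Hypothetical names -- any WACZ whose WARCs are stored uncompressed will do.
    wacz_file = "example.wacz"
    member = "archive/data.warc.gz"

    with zipfile.ZipFile(wacz_file) as zf:
        zinfo = zf.getinfo(member)
        # The indexer requires WARC members to be stored, not deflated:
        assert zinfo.compress_type == zipfile.ZIP_STORED
        # First byte of the member's data = local header position + header length:
        data_start = zinfo.header_offset + len(zinfo.FileHeader())
        via_zipfile = zf.open(zinfo).read(100)

    # Reading the same bytes directly from the WACZ at the computed offset:
    with open(wacz_file, "rb") as f:
        f.seek(data_start)
        assert f.read(100) == via_zipfile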

From 4125787015837b3249a7982a557339fe447c0cb6 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Thu, 14 Sep 2023 11:05:40 +0100
Subject: [PATCH 2/4] Fix formatting with Black.

---
 wacz/main.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/wacz/main.py b/wacz/main.py
index 063f9a3..68be7b0 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -160,10 +160,8 @@ def validate_wacz(res):


 def index_wacz(res):
-
     # Open up the ZIP:
     with zipfile.ZipFile(res.file) as zf:
-
         # Determine the WACZ filename/path to use:
         wacz_path = os.path.basename(res.file)
         # TODO Add option to override prefix
@@ -172,17 +170,21 @@ def index_wacz(res):
         archive_offsets = {}
         archives_prefix = "archive/"
         for zinfo in zf.infolist():
-            #print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
+            # print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
             if zinfo.filename.startswith(archives_prefix):
-                archive_name = zinfo.filename[len(archives_prefix):]
-                archive_offsets[archive_name] = zinfo.header_offset + len(zinfo.FileHeader())
+                archive_name = zinfo.filename[len(archives_prefix) :]
+                archive_offsets[archive_name] = zinfo.header_offset + len(
+                    zinfo.FileHeader()
+                )
                 if zinfo.compress_type != 0:
-                    raise Exception("Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!")
+                    raise Exception(
+                        "Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!"
+                    )

         # Stream through the index in the WACZ:
         index_file = "indexes/index.cdx.gz"
         zinfo = zf.getinfo(index_file)
-        print(f'{zinfo.filename}: offset {zinfo.header_offset}')
+        # print(f"{zinfo.filename}: offset {zinfo.header_offset}")
         with zf.open(index_file) as f:
             reader = DecompressingBufferedReader(f)
             while True:
@@ -191,19 +193,24 @@ def index_wacz(res):
                 line = reader.readline()
                 # If we reach the end, stop:
                 if len(line) == 0:
                     break
                 # Otherwise, decode the line:
-                surt, timestamp, json_data_str = line.decode('utf-8').rstrip('\n').split(' ', maxsplit=2)
+                surt, timestamp, json_data_str = (
+                    line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
+                )
                 json_data = json.loads(json_data_str)
                 # Override the offset to include the file offset in the ZIP
-                archive_filename = json_data['filename']
-                archive_offset = json_data['offset']
-                json_data['offset'] = archive_offsets[archive_filename] + int(archive_offset)
-                #json_data['original_offset'] = archive_offset
+                archive_filename = json_data["filename"]
+                archive_offset = json_data["offset"]
+                json_data["offset"] = archive_offsets[archive_filename] + int(
+                    archive_offset
+                )
+                # json_data['original_offset'] = archive_offset
                 # Also override the filename to point at the WACZ
-                json_data['filename'] = wacz_path
-                #json_data['original_filename'] = archive_filename
+                json_data["filename"] = wacz_path
+                # json_data['original_filename'] = archive_filename
                 # Output the modified values:
                 print(f"{surt} {timestamp} {json.dumps(json_data)}")

+
 def create_wacz(res):
     wacz = zipfile.ZipFile(res.output, "w")
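
The reformatting does not change the behaviour: each CDXJ line keeps its SURT key and timestamp, and only the filename and offset fields in the JSON block are rewritten, the offset becoming the record's offset within its WARC plus the position of that WARC inside the WACZ. For example, with hypothetical values, and assuming data.warc.gz's data starts at byte 40000 of example.wacz:

    Before (as stored in indexes/index.cdx.gz, relative to archive/data.warc.gz):
    org,example)/ 20230913082240 {"url": "https://example.org/", "mime": "text/html", "status": "200", "length": "1234", "offset": "5678", "filename": "data.warc.gz"}

    After (relative to the WACZ itself):
    org,example)/ 20230913082240 {"url": "https://example.org/", "mime": "text/html", "status": "200", "length": "1234", "offset": 45678, "filename": "example.wacz"}

Note that the rewritten offset comes out as a JSON number rather than a string, because the code assigns the result of the integer addition back into the field.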

From 35f4a230288005678ebbe4f01cc018c6d2fb2774 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Thu, 14 Sep 2023 14:44:52 +0100
Subject: [PATCH 3/4] Allow output and WACZ prefix to be set.

---
 wacz/main.py | 80 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/wacz/main.py b/wacz/main.py
index 68be7b0..905cd17 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -105,7 +105,23 @@ def main(args=None):
     )

     index = subparsers.add_parser("index", help="generate a WACZ-level CDXJ index")
-    index.add_argument("-f", "--file", required=True)
+    index.add_argument(
+        "-f", "--file", required=True, help="The WACZ file to read and index."
+    )
+    index.add_argument(
+        "-o",
+        "--output-file",
+        required=False,
+        default="-",
+        help="The CDXJ output file. Defaults to '-' which means to print to STDOUT.",
+    )
+    index.add_argument(
+        "-p",
+        "--wacz-prefix",
+        required=False,
+        default=None,
+        help="Prefix to use when referring to the WACZ file from the CDXJ index. e.g. if the prefix is '/disk/path/' and the WACZ is called example.wacz then the CDXJ file will refer to '/disk/path/example.wacz'.",
+    )
     index.set_defaults(func=index_wacz)

     cmd = parser.parse_args(args=args)
@@ -164,13 +180,14 @@ def index_wacz(res):
     with zipfile.ZipFile(res.file) as zf:
         # Determine the WACZ filename/path to use:
         wacz_path = os.path.basename(res.file)
-        # TODO Add option to override prefix
+        # Allow users to specify the prefix where the WACZ is stored:
+        if res.wacz_prefix:
+            wacz_path = f"{res.wacz_prefix}{wacz_path}"

         # Get a look-up table for offsets for each archive file:
         archive_offsets = {}
         archives_prefix = "archive/"
         for zinfo in zf.infolist():
-            # print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
             if zinfo.filename.startswith(archives_prefix):
                 archive_name = zinfo.filename[len(archives_prefix) :]
                 archive_offsets[archive_name] = zinfo.header_offset + len(
@@ -181,34 +198,35 @@ def index_wacz(res):
                     "Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!"
                 )

-        # Stream through the index in the WACZ:
-        index_file = "indexes/index.cdx.gz"
-        zinfo = zf.getinfo(index_file)
-        # print(f"{zinfo.filename}: offset {zinfo.header_offset}")
-        with zf.open(index_file) as f:
-            reader = DecompressingBufferedReader(f)
-            while True:
-                line = reader.readline()
-                # If we reach the end, stop:
-                if len(line) == 0:
-                    break
-                # Otherwise, decode the line:
-                surt, timestamp, json_data_str = (
-                    line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
-                )
-                json_data = json.loads(json_data_str)
-                # Override the offset to include the file offset in the ZIP
-                archive_filename = json_data["filename"]
-                archive_offset = json_data["offset"]
-                json_data["offset"] = archive_offsets[archive_filename] + int(
-                    archive_offset
-                )
-                # json_data['original_offset'] = archive_offset
-                # Also override the filename to point at the WACZ
-                json_data["filename"] = wacz_path
-                # json_data['original_filename'] = archive_filename
-                # Output the modified values:
-                print(f"{surt} {timestamp} {json.dumps(json_data)}")
+        # Set up the output stream:
+        with open(
+            res.output_file, "w"
+        ) if res.output_file != "-" else sys.stdout as f_out:
+            # Stream through the index in the WACZ:
+            index_file = "indexes/index.cdx.gz"
+            zinfo = zf.getinfo(index_file)
+            with zf.open(index_file) as f:
+                reader = DecompressingBufferedReader(f)
+                while True:
+                    line = reader.readline()
+                    # If we reach the end, stop:
+                    if len(line) == 0:
+                        break
+                    # Otherwise, decode the line:
+                    surt, timestamp, json_data_str = (
+                        line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
+                    )
+                    json_data = json.loads(json_data_str)
+                    # Override the offset to include the file offset in the ZIP
+                    archive_filename = json_data["filename"]
+                    archive_offset = json_data["offset"]
+                    json_data["offset"] = archive_offsets[archive_filename] + int(
+                        archive_offset
+                    )
+                    # Also override the filename to point at the WACZ
+                    json_data["filename"] = wacz_path
+                    # Output the modified values:
+                    f_out.write(f"{surt} {timestamp} {json.dumps(json_data)}\n")


 def create_wacz(res):
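
With these options the index can be written to a file rather than STDOUT, and the CDXJ can refer to the WACZ by a fuller path or URL prefix. Hypothetical invocations, assuming the wacz console script that py-wacz installs:

    # Print the WACZ-level index on STDOUT (the default):
    wacz index -f example.wacz

    # Write it to a file, and refer to the WACZ via a path prefix:
    wacz index -f example.wacz -o example.cdxj -p /disk/path/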

From 2280a5293083b09c23f9f2b112a523e74e4ab310 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Thu, 14 Sep 2023 14:58:24 +0100
Subject: [PATCH 4/4] Tweak code ordering for clarity.

---
 wacz/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wacz/main.py b/wacz/main.py
index 905cd17..37b3635 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -217,14 +217,14 @@ def index_wacz(res):
                         line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
                     )
                     json_data = json.loads(json_data_str)
-                    # Override the offset to include the file offset in the ZIP
+                    # Also override the filename to point at the WACZ:
                     archive_filename = json_data["filename"]
+                    json_data["filename"] = wacz_path
+                    # Override the offset to include the file offset in the ZIP:
                     archive_offset = json_data["offset"]
                     json_data["offset"] = archive_offsets[archive_filename] + int(
                         archive_offset
                     )
-                    # Also override the filename to point at the WACZ
-                    json_data["filename"] = wacz_path
                     # Output the modified values:
                     f_out.write(f"{surt} {timestamp} {json.dumps(json_data)}\n")
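
Because the rewritten offsets are absolute positions within the WACZ, a client holding only the CDXJ can pull a single record out of a remote WACZ with an HTTP Range request, without downloading or unzipping the whole archive. A rough sketch, assuming a hypothetical URL, a server that honours Range requests, an entry that carries the usual length field, and WARC records that are individually gzipped (the normal .warc.gz layout):

    import io
    import json

    import requests
    from warcio.bufferedreaders import DecompressingBufferedReader
    from warcio.recordloader import ArcWarcRecordLoader


    def fetch_record(wacz_url, cdxj_line):
        # Parse the rewritten CDXJ line: SURT key, timestamp, then a JSON block.
        _surt, _timestamp, fields = cdxj_line.split(" ", maxsplit=2)
        entry = json.loads(fields)
        start = int(entry["offset"])
        end = start + int(entry["length"]) - 1
        # Fetch only the bytes of this record from within the WACZ.
        resp = requests.get(wacz_url, headers={"Range": f"bytes={start}-{end}"})
        resp.raise_for_status()
        # Decompress the gzipped record and parse it as a WARC record.
        stream = DecompressingBufferedReader(io.BytesIO(resp.content))
        return ArcWarcRecordLoader().parse_record_stream(stream)


    # Hypothetical usage:
    # record = fetch_record("https://example.org/waczs/example.wacz", cdxj_line)
    # print(record.rec_headers.get_header("WARC-Target-URI"))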