From 153fb61136183b459ee3f088a5dff50759793a98 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Wed, 13 Sep 2023 08:22:40 +0100
Subject: [PATCH 1/4] Add index generation system that uses offsets into the WACZ itself.

---
 wacz/main.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/wacz/main.py b/wacz/main.py
index 2704086..063f9a3 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -1,11 +1,12 @@
 from argparse import ArgumentParser, RawTextHelpFormatter
 from io import BytesIO, StringIO, TextIOWrapper
-import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources
+import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources, zlib
 from wacz.waczindexer import WACZIndexer
 from wacz.util import now, WACZ_VERSION, construct_passed_pages_dict
 from wacz.validate import Validation, OUTDATED_WACZ
 from wacz.util import validateJSON, get_py_wacz_version
 from warcio.timeutils import iso_date_to_timestamp
+from warcio.bufferedreaders import DecompressingBufferedReader

 """
 WACZ Generator
@@ -103,6 +104,10 @@ def main(args=None):
         help="URL of verify server to verify the signature, if any, in datapackage-digest.json",
     )

+    index = subparsers.add_parser("index", help="generate a WACZ-level CDXJ index")
+    index.add_argument("-f", "--file", required=True)
+    index.set_defaults(func=index_wacz)
+
     cmd = parser.parse_args(args=args)

     if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None:
@@ -154,6 +159,51 @@ def validate_wacz(res):
     return 0


+def index_wacz(res):
+
+    # Open up the ZIP:
+    with zipfile.ZipFile(res.file) as zf:
+
+        # Determine the WACZ filename/path to use:
+        wacz_path = os.path.basename(res.file)
+        # TODO Add option to override prefix
+
+        # Get a look-up table for offsets for each archive file:
+        archive_offsets = {}
+        archives_prefix = "archive/"
+        for zinfo in zf.infolist():
+            #print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
+            if zinfo.filename.startswith(archives_prefix):
+                archive_name = zinfo.filename[len(archives_prefix):]
+                archive_offsets[archive_name] = zinfo.header_offset + len(zinfo.FileHeader())
+                if zinfo.compress_type != 0:
+                    raise Exception("Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!")
+
+        # Stream through the index in the WACZ:
+        index_file = "indexes/index.cdx.gz"
+        zinfo = zf.getinfo(index_file)
+        print(f'{zinfo.filename}: offset {zinfo.header_offset}')
+        with zf.open(index_file) as f:
+            reader = DecompressingBufferedReader(f)
+            while True:
+                line = reader.readline()
+                # If we reach the end, stop:
+                if len(line) == 0:
+                    break
+                # Otherwise, decode the line:
+                surt, timestamp, json_data_str = line.decode('utf-8').rstrip('\n').split(' ', maxsplit=2)
+                json_data = json.loads(json_data_str)
+                # Override the offset to include the file offset in the ZIP
+                archive_filename = json_data['filename']
+                archive_offset = json_data['offset']
+                json_data['offset'] = archive_offsets[archive_filename] + int(archive_offset)
+                #json_data['original_offset'] = archive_offset
+                # Also override the filename to point at the WACZ
+                json_data['filename'] = wacz_path
+                #json_data['original_filename'] = archive_filename
+                # Output the modified values:
+                print(f"{surt} {timestamp} {json.dumps(json_data)}")
+
 def create_wacz(res):
     wacz = zipfile.ZipFile(res.output, "w")
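
The offset arithmetic above relies on the WARC members being stored in the ZIP without compression: zinfo.header_offset is where the member's local file header starts, and len(zinfo.FileHeader()) is the length of that header, so their sum is the first byte of the WARC data within the WACZ. A record offset taken from the CDXJ can then simply be shifted by that amount. A minimal standard-library sketch of that assumption, using hypothetical file and member names (and assuming the rebuilt local header matches the one actually on disk):

    import zipfile

    # Hypothetical names -- any WACZ whose WARCs are stored uncompressed will do.
    wacz_file = "example.wacz"
    member = "archive/data.warc.gz"

    with zipfile.ZipFile(wacz_file) as zf:
        zinfo = zf.getinfo(member)
        # The indexer requires WARC members to be stored, not deflated:
        assert zinfo.compress_type == zipfile.ZIP_STORED
        # First byte of the member's data = local header position + header length:
        data_start = zinfo.header_offset + len(zinfo.FileHeader())
        via_zipfile = zf.open(zinfo).read(100)

    # Reading the same bytes directly from the WACZ at the computed offset:
    with open(wacz_file, "rb") as f:
        f.seek(data_start)
        assert f.read(100) == via_zipfile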

From 4125787015837b3249a7982a557339fe447c0cb6 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Thu, 14 Sep 2023 11:05:40 +0100
Subject: [PATCH 2/4] Fix formatting with Black.

---
 wacz/main.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/wacz/main.py b/wacz/main.py
index 063f9a3..68be7b0 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -160,10 +160,8 @@ def validate_wacz(res):


 def index_wacz(res):
-
     # Open up the ZIP:
     with zipfile.ZipFile(res.file) as zf:
-
         # Determine the WACZ filename/path to use:
         wacz_path = os.path.basename(res.file)
         # TODO Add option to override prefix
@@ -172,17 +170,21 @@ def index_wacz(res):
         archive_offsets = {}
         archives_prefix = "archive/"
         for zinfo in zf.infolist():
-            #print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
+            # print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
             if zinfo.filename.startswith(archives_prefix):
-                archive_name = zinfo.filename[len(archives_prefix):]
-                archive_offsets[archive_name] = zinfo.header_offset + len(zinfo.FileHeader())
+                archive_name = zinfo.filename[len(archives_prefix) :]
+                archive_offsets[archive_name] = zinfo.header_offset + len(
+                    zinfo.FileHeader()
+                )
                 if zinfo.compress_type != 0:
-                    raise Exception("Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!")
+                    raise Exception(
+                        "Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!"
+                    )

         # Stream through the index in the WACZ:
         index_file = "indexes/index.cdx.gz"
         zinfo = zf.getinfo(index_file)
-        print(f'{zinfo.filename}: offset {zinfo.header_offset}')
+        # print(f"{zinfo.filename}: offset {zinfo.header_offset}")
         with zf.open(index_file) as f:
             reader = DecompressingBufferedReader(f)
             while True:
@@ -191,19 +193,24 @@ def index_wacz(res):
                 line = reader.readline()
                 # If we reach the end, stop:
                 if len(line) == 0:
                     break
                 # Otherwise, decode the line:
-                surt, timestamp, json_data_str = line.decode('utf-8').rstrip('\n').split(' ', maxsplit=2)
+                surt, timestamp, json_data_str = (
+                    line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
+                )
                 json_data = json.loads(json_data_str)
                 # Override the offset to include the file offset in the ZIP
-                archive_filename = json_data['filename']
-                archive_offset = json_data['offset']
-                json_data['offset'] = archive_offsets[archive_filename] + int(archive_offset)
-                #json_data['original_offset'] = archive_offset
+                archive_filename = json_data["filename"]
+                archive_offset = json_data["offset"]
+                json_data["offset"] = archive_offsets[archive_filename] + int(
+                    archive_offset
+                )
+                # json_data['original_offset'] = archive_offset
                 # Also override the filename to point at the WACZ
-                json_data['filename'] = wacz_path
-                #json_data['original_filename'] = archive_filename
+                json_data["filename"] = wacz_path
+                # json_data['original_filename'] = archive_filename
                 # Output the modified values:
                 print(f"{surt} {timestamp} {json.dumps(json_data)}")

+
 def create_wacz(res):
     wacz = zipfile.ZipFile(res.output, "w")
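
The reformatting does not change the behaviour: each CDXJ line keeps its SURT key and timestamp, and only the filename and offset fields in the JSON block are rewritten, the offset becoming the record's offset within its WARC plus the position of that WARC inside the WACZ. For example, with hypothetical values, and assuming data.warc.gz's data starts at byte 40000 of example.wacz:

    Before (as stored in indexes/index.cdx.gz, relative to archive/data.warc.gz):
    org,example)/ 20230913082240 {"url": "https://example.org/", "mime": "text/html", "status": "200", "length": "1234", "offset": "5678", "filename": "data.warc.gz"}

    After (relative to the WACZ itself):
    org,example)/ 20230913082240 {"url": "https://example.org/", "mime": "text/html", "status": "200", "length": "1234", "offset": 45678, "filename": "example.wacz"}

Note that the rewritten offset comes out as a JSON number rather than a string, because the code assigns the result of the integer addition back into the field.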

From 35f4a230288005678ebbe4f01cc018c6d2fb2774 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Thu, 14 Sep 2023 14:44:52 +0100
Subject: [PATCH 3/4] Allow output and WACZ prefix to be set.

---
 wacz/main.py | 80 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/wacz/main.py b/wacz/main.py
index 68be7b0..905cd17 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -105,7 +105,23 @@ def main(args=None):
     )

     index = subparsers.add_parser("index", help="generate a WACZ-level CDXJ index")
-    index.add_argument("-f", "--file", required=True)
+    index.add_argument(
+        "-f", "--file", required=True, help="The WACZ file to read and index."
+    )
+    index.add_argument(
+        "-o",
+        "--output-file",
+        required=False,
+        default="-",
+        help="The CDXJ output file. Defaults to '-' which means to print to STDOUT.",
+    )
+    index.add_argument(
+        "-p",
+        "--wacz-prefix",
+        required=False,
+        default=None,
+        help="Prefix to use when referring to the WACZ file from the CDXJ index. e.g. if the prefix is '/disk/path/' and the WACZ is called example.wacz then the CDXJ file will refer to '/disk/path/example.wacz'.",
+    )
     index.set_defaults(func=index_wacz)

     cmd = parser.parse_args(args=args)
@@ -164,13 +180,14 @@ def index_wacz(res):
     with zipfile.ZipFile(res.file) as zf:
         # Determine the WACZ filename/path to use:
         wacz_path = os.path.basename(res.file)
-        # TODO Add option to override prefix
+        # Allow users to specify the prefix where the WACZ is stored:
+        if res.wacz_prefix:
+            wacz_path = f"{res.wacz_prefix}{wacz_path}"

         # Get a look-up table for offsets for each archive file:
         archive_offsets = {}
         archives_prefix = "archive/"
         for zinfo in zf.infolist():
-            # print(f'{zinfo.filename}: offset {zinfo.header_offset} compress_type {zinfo.compress_type}')
             if zinfo.filename.startswith(archives_prefix):
                 archive_name = zinfo.filename[len(archives_prefix) :]
                 archive_offsets[archive_name] = zinfo.header_offset + len(
@@ -181,34 +198,35 @@ def index_wacz(res):
                     "Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!"
                 )

-        # Stream through the index in the WACZ:
-        index_file = "indexes/index.cdx.gz"
-        zinfo = zf.getinfo(index_file)
-        # print(f"{zinfo.filename}: offset {zinfo.header_offset}")
-        with zf.open(index_file) as f:
-            reader = DecompressingBufferedReader(f)
-            while True:
-                line = reader.readline()
-                # If we reach the end, stop:
-                if len(line) == 0:
-                    break
-                # Otherwise, decode the line:
-                surt, timestamp, json_data_str = (
-                    line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
-                )
-                json_data = json.loads(json_data_str)
-                # Override the offset to include the file offset in the ZIP
-                archive_filename = json_data["filename"]
-                archive_offset = json_data["offset"]
-                json_data["offset"] = archive_offsets[archive_filename] + int(
-                    archive_offset
-                )
-                # json_data['original_offset'] = archive_offset
-                # Also override the filename to point at the WACZ
-                json_data["filename"] = wacz_path
-                # json_data['original_filename'] = archive_filename
-                # Output the modified values:
-                print(f"{surt} {timestamp} {json.dumps(json_data)}")
+        # Set up the output stream:
+        with open(
+            res.output_file, "w"
+        ) if res.output_file != "-" else sys.stdout as f_out:
+            # Stream through the index in the WACZ:
+            index_file = "indexes/index.cdx.gz"
+            zinfo = zf.getinfo(index_file)
+            with zf.open(index_file) as f:
+                reader = DecompressingBufferedReader(f)
+                while True:
+                    line = reader.readline()
+                    # If we reach the end, stop:
+                    if len(line) == 0:
+                        break
+                    # Otherwise, decode the line:
+                    surt, timestamp, json_data_str = (
+                        line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
+                    )
+                    json_data = json.loads(json_data_str)
+                    # Override the offset to include the file offset in the ZIP
+                    archive_filename = json_data["filename"]
+                    archive_offset = json_data["offset"]
+                    json_data["offset"] = archive_offsets[archive_filename] + int(
+                        archive_offset
+                    )
+                    # Also override the filename to point at the WACZ
+                    json_data["filename"] = wacz_path
+                    # Output the modified values:
+                    f_out.write(f"{surt} {timestamp} {json.dumps(json_data)}\n")


 def create_wacz(res):
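
With these options the index can be written to a file rather than STDOUT, and the CDXJ can refer to the WACZ by a fuller path or URL prefix. Hypothetical invocations, assuming the wacz console script that py-wacz installs:

    # Print the WACZ-level index on STDOUT (the default):
    wacz index -f example.wacz

    # Write it to a file, and refer to the WACZ via a path prefix:
    wacz index -f example.wacz -o example.cdxj -p /disk/path/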

From 2280a5293083b09c23f9f2b112a523e74e4ab310 Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Thu, 14 Sep 2023 14:58:24 +0100
Subject: [PATCH 4/4] Tweak code ordering for clarity.

---
 wacz/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wacz/main.py b/wacz/main.py
index 905cd17..37b3635 100644
--- a/wacz/main.py
+++ b/wacz/main.py
@@ -217,14 +217,14 @@ def index_wacz(res):
                         line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
                     )
                     json_data = json.loads(json_data_str)
-                    # Override the offset to include the file offset in the ZIP
+                    # Also override the filename to point at the WACZ:
                     archive_filename = json_data["filename"]
+                    json_data["filename"] = wacz_path
+                    # Override the offset to include the file offset in the ZIP:
                     archive_offset = json_data["offset"]
                     json_data["offset"] = archive_offsets[archive_filename] + int(
                         archive_offset
                     )
-                    # Also override the filename to point at the WACZ
-                    json_data["filename"] = wacz_path
                     # Output the modified values:
                     f_out.write(f"{surt} {timestamp} {json.dumps(json_data)}\n")
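
Because the rewritten offsets are absolute positions within the WACZ, a client holding only the CDXJ can pull a single record out of a remote WACZ with an HTTP Range request, without downloading or unzipping the whole archive. A rough sketch, assuming a hypothetical URL, a server that honours Range requests, an entry that carries the usual length field, and WARC records that are individually gzipped (the normal .warc.gz layout):

    import io
    import json

    import requests
    from warcio.bufferedreaders import DecompressingBufferedReader
    from warcio.recordloader import ArcWarcRecordLoader


    def fetch_record(wacz_url, cdxj_line):
        # Parse the rewritten CDXJ line: SURT key, timestamp, then a JSON block.
        _surt, _timestamp, fields = cdxj_line.split(" ", maxsplit=2)
        entry = json.loads(fields)
        start = int(entry["offset"])
        end = start + int(entry["length"]) - 1
        # Fetch only the bytes of this record from within the WACZ.
        resp = requests.get(wacz_url, headers={"Range": f"bytes={start}-{end}"})
        resp.raise_for_status()
        # Decompress the gzipped record and parse it as a WARC record.
        stream = DecompressingBufferedReader(io.BytesIO(resp.content))
        return ArcWarcRecordLoader().parse_record_stream(stream)


    # Hypothetical usage:
    # record = fetch_record("https://example.org/waczs/example.wacz", cdxj_line)
    # print(record.rec_headers.get_header("WARC-Target-URI"))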