diff --git a/README.md b/README.md index d6f9e04..6de1baf 100644 --- a/README.md +++ b/README.md @@ -71,12 +71,20 @@ Overrides the pages index generation with the passed jsonl pages. wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl ``` -### --pages-file +### -e --extra-pages -Overrides the pages index generation by copying existing pages.jsonl and/or extraPages.jsonl directly into the WACZ. Incompatible with --detect-pages and -p/--pages options. +Overrides the extra pages index generation with the passed extra jsonl pages. ``` -wacz create tests/fixtures/example-collection.warc --pages-file pages/pages.jsonl --pages-file pages/extraPages.jsonl +wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl -e extra_pages.jsonl +``` + +### -c --copy-pages + +Overrides the behavior of --pages and --extra-pages options to copy existing pages.jsonl and/or extraPages.jsonl as-is directly into the WACZ rather than parsing their contents. + +``` +wacz create tests/fixtures/example-collection.warc --pages pages/pages.jsonl --extra-pages pages/extraPages.jsonl --copy-pages ``` ### -t --text diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore index 40cd956..720e864 100644 --- a/tests/fixtures/.gitignore +++ b/tests/fixtures/.gitignore @@ -5,3 +5,4 @@ !*.warc.gz !*.wacz !*.jsonl +!pages/* diff --git a/tests/fixtures/pages/extraPages.jsonl b/tests/fixtures/pages/extraPages.jsonl new file mode 100644 index 0000000..f0c15cc --- /dev/null +++ b/tests/fixtures/pages/extraPages.jsonl @@ -0,0 +1,4 @@ +{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"} +{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"} +{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", 
"loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} +{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/tests/fixtures/pages/invalid.jsonl b/tests/fixtures/pages/invalid.jsonl new file mode 100644 index 0000000..89930b9 --- /dev/null +++ b/tests/fixtures/pages/invalid.jsonl @@ -0,0 +1,2 @@ +{id": "extra-pages", "title": "Extra Pages"} +{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/tests/fixtures/pages/invalid.txt b/tests/fixtures/pages/invalid.txt new file mode 100644 index 0000000..f1fe4c4 --- /dev/null +++ b/tests/fixtures/pages/invalid.txt @@ -0,0 +1 @@ +Not a JSONL file diff --git a/tests/fixtures/pages/pages.jsonl b/tests/fixtures/pages/pages.jsonl new file mode 100644 index 0000000..ffee2c7 --- /dev/null +++ b/tests/fixtures/pages/pages.jsonl @@ -0,0 +1,2 @@ +{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"} +{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"} diff --git a/tests/test_create_wacz.py b/tests/test_create_wacz.py index 7095c03..a0e9a6c 100644 --- a/tests/test_create_wacz.py +++ b/tests/test_create_wacz.py @@ -1,19 +1,12 @@ import unittest, os, zipfile, sys, gzip, json, tempfile from wacz.main import main, now from unittest.mock import patch -from wacz.util import hash_stream +from wacz.util import hash_file from frictionless import validate, Report TEST_DIR = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") -def hash_file(type_, filename): - with open(filename, "rb") as fh: - size_, hash_ = hash_stream(type_, fh) - - return hash_ - - class TestWaczFormat(unittest.TestCase): def find_resource(self, resource_list, filename): for file in resource_list: diff --git a/tests/test_optional_flags_wacz.py b/tests/test_optional_flags_wacz.py index c1d0500..046f631 100644 --- a/tests/test_optional_flags_wacz.py +++ b/tests/test_optional_flags_wacz.py @@ -3,10 +3,12 @@ import os import zipfile, json, gzip from wacz.main import main, now +from wacz.util import hash_file from unittest.mock import patch import jsonlines TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") +PAGES_DIR = os.path.join(TEST_DIR, "pages") class TestWaczFormat(unittest.TestCase): @@ -35,6 +37,95 @@ def test_warc_with_invalid_passed_pages(self): 0, ) + def test_invalid_passed_pages_copy_pages(self): + """If a user passes an invalid pages.jsonl file using --pages --copy-pages we should return an error""" + with tempfile.TemporaryDirectory() as tmpdir: + self.assertEqual( + main( + [ + "create", + "-f", + os.path.join(TEST_DIR, "example-collection.warc"), + "-o", + os.path.join( + tmpdir, "example-collection-invalid-copy-pages.wacz" + ), + "-p", + os.path.join(PAGES_DIR, "invalid.jsonl"), + "--copy-pages", + ] + ), + 1, + ) + + self.assertEqual( + main( + [ + "create", + "-f", + os.path.join(TEST_DIR, "example-collection.warc"), + "-o", + os.path.join( + tmpdir, "example-collection-invalid-copy-pages-txt.wacz" + ), + "-p", + os.path.join(PAGES_DIR, "invalid.txt"), + "--copy-pages", + ] + ), + 1, + ) + + def test_invalid_passed_extra_pages_copy_pages(self): + """If a user passes an invalid extraPages.jsonl file using -e --copy-pages we still create WACZ without extra pages""" + with tempfile.TemporaryDirectory() as tmpdir: + self.assertEqual( + main( + [ + "create", + "-f", + os.path.join(TEST_DIR, 
"example-collection.warc"), + "-o", + os.path.join( + tmpdir, "example-collection-invalid-copy-extra-pages.wacz" + ), + "-p", + os.path.join(PAGES_DIR, "pages.jsonl"), + "-e", + os.path.join(PAGES_DIR, "invalid.txt"), + "--copy-pages", + ] + ), + 0, + ) + + with zipfile.ZipFile( + os.path.join( + tmpdir, "example-collection-invalid-copy-extra-pages.wacz" + ), + "r", + ) as zip_ref: + zip_ref.extractall(os.path.join(tmpdir, "wacz_no_extra_pages")) + zip_ref.close() + + self.assertEqual( + main( + [ + "validate", + "-f", + os.path.join( + tmpdir, "example-collection-invalid-copy-extra-pages.wacz" + ), + ] + ), + 0, + ) + + self.assertFalse( + "extraPages.jsonl" + in os.listdir(os.path.join(tmpdir, "wacz_no_extra_pages/pages/")) + ) + @patch("wacz.main.now") def test_warc_with_pages_flag(self, mock_now): """When passing the pages flag with a valid pages.jsonl file a pages/pages.jsonl file should be created""" @@ -95,6 +186,68 @@ def test_warc_with_pages_flag(self, mock_now): self.assertTrue("url" in obj.keys()) self.assertTrue(obj["url"].encode() in cdx_content) + @patch("wacz.main.now") + def test_warc_with_copy_pages(self, mock_now): + """When passing the pages and extra-pages flags with copy-pages, the files should end up in the WACZ exactly as-is""" + mock_now.return_value = (2020, 10, 7, 22, 29, 10) + + with tempfile.TemporaryDirectory() as tmpdir: + self.assertEqual( + main( + [ + "create", + "-f", + os.path.join(TEST_DIR, "example-collection.warc"), + "-o", + os.path.join(tmpdir, "example-collection-copy-pages.wacz"), + "-p", + os.path.join(PAGES_DIR, "pages.jsonl"), + "-e", + os.path.join(PAGES_DIR, "extraPages.jsonl"), + "--copy-pages", + ] + ), + 0, + ) + + with zipfile.ZipFile( + os.path.join(tmpdir, "example-collection-copy-pages.wacz"), "r" + ) as zip_ref: + zip_ref.extractall(os.path.join(tmpdir, "unzipped_copy_pages")) + zip_ref.close() + + self.assertEqual( + main( + [ + "validate", + "-f", + os.path.join(tmpdir, "example-collection-copy-pages.wacz"), 
+ ] + ), + 0, + ) + + wacz_pages = os.path.join(tmpdir, "unzipped_copy_pages/pages/pages.jsonl") + wacz_extra_pages = os.path.join( + tmpdir, "unzipped_copy_pages/pages/extraPages.jsonl" + ) + + self.assertTrue( + "pages.jsonl" + in os.listdir(os.path.join(tmpdir, "unzipped_copy_pages/pages/")) + ) + self.assertTrue( + "extraPages.jsonl" + in os.listdir(os.path.join(tmpdir, "unzipped_copy_pages/pages/")) + ) + + self.assertEqual( + hash_file("sha256", wacz_pages), hash_file("sha256", os.path.join(PAGES_DIR, "pages.jsonl")) + ) + self.assertEqual( + hash_file("sha256", wacz_extra_pages), hash_file("sha256", os.path.join(PAGES_DIR, "extraPages.jsonl")) + ) + @patch("wacz.main.now") def test_warc_with_detect_pages_flag(self, mock_now): """When passing the text index flag pages/pages.jsonl should be generated.""" diff --git a/wacz/main.py b/wacz/main.py index e913ae0..50db50b 100644 --- a/wacz/main.py +++ b/wacz/main.py @@ -4,7 +4,7 @@ from wacz.waczindexer import WACZIndexer from wacz.util import now, WACZ_VERSION, construct_passed_pages_dict from wacz.validate import Validation, OUTDATED_WACZ -from wacz.util import validateJSON, get_py_wacz_version +from wacz.util import validateJSON, get_py_wacz_version, validate_pages_jsonl_file from warcio.timeutils import iso_date_to_timestamp """ @@ -60,9 +60,10 @@ def main(args=None): ) create.add_argument( - "--pages-file", - help="Overrides the pages generation by copying files to WACZ without parsing", - nargs="+", + "-c", + "--copy-pages", + help="Overrides the pages/extra-pages options by copying files to WACZ without parsing", + action="store_true", ) create.add_argument( @@ -114,18 +115,9 @@ def main(args=None): if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None: parser.error("--url must be specified when --ts is passed") - if ( - cmd.cmd == "create" - and cmd.detect_pages is not False - and (cmd.pages is not None or cmd.pages_file is not None) - ): + if cmd.cmd == "create" and cmd.detect_pages is not False and cmd.pages is not None: parser.error( - 
"--pages/--pages-file and --detect-pages can't be set at the same time they cancel each other out." - ) - - if cmd.cmd == "create" and cmd.pages is not None and cmd.pages_file is not None: - parser.error( - "--pages and --pages-file can't be set at same time as they cancel each other out." + "--pages and --detect-pages can't be set at the same time they cancel each other out." ) value = cmd.func(cmd) @@ -144,7 +136,7 @@ def validate_wacz(res): validation_tests = [] if version == OUTDATED_WACZ: - print("Validation Succeeded the passed Wacz is outdate but valid") + print("Validation succeeded, the passed WACZ is outdated but valid") return 0 elif version == WACZ_VERSION: @@ -156,16 +148,16 @@ def validate_wacz(res): validate.check_data_package_hash_and_sig, ] else: - print("Validation Failed the passed Wacz is invalid") + print("Validation failed, the passed WACZ is invalid") return 1 for func in validation_tests: success = func() if success is False: - print("Validation Failed the passed Wacz is invalid") + print("Validation failed, the passed WACZ is invalid") return 1 - print("Validation Succeeded the passed Wacz is valid") + print("Validation succeeded, the passed WACZ is valid") return 0 @@ -190,55 +182,60 @@ def create_wacz(res): passed_pages_dict = {} # Handle pages - if res.pages_file is not None: - for page_file in res.pages_file: - page_file = os.path.abspath(page_file) - filename = os.path.basename(page_file) - - if filename == "pages.jsonl": - with wacz.open(pages_jsonl, "w") as page_jsonl_file: - with open(page_file, "rb") as in_fh: - shutil.copyfileobj(in_fh, page_jsonl_file) - - if filename == "extraPages.jsonl": - with wacz.open(extra_pages_jsonl, "w") as extra_page_file: - with open(page_file, "rb") as in_fh: - shutil.copyfileobj(in_fh, extra_page_file) - if res.pages != None: - print("Validating passed pages.jsonl file") - passed_content = [] - with open(res.pages, "rb") as fh: - for line in fh: - if not line: - continue - - try: - line = 
line.decode("utf-8") - passed_content.append(line) - except: - print("Page data not utf-8 encoded, skipping", line) - - # Create a dict of the passed pages that will be used in the construction of the index - passed_pages_dict = construct_passed_pages_dict(passed_content) + if res.copy_pages: + print("Copying passed pages.jsonl file to WACZ") + + if not validate_pages_jsonl_file(res.pages): + print("Unable to create WACZ without valid pages.jsonl file, quitting") + wacz.close() + return 1 + + with open(res.pages, "rb") as fh: + with wacz.open(pages_jsonl, "w") as pages_file: + shutil.copyfileobj(fh, pages_file) + + else: + print("Validating passed pages.jsonl file") + passed_content = [] + with open(res.pages, "rb") as fh: + for line in fh: + if not line: + continue + + try: + line = line.decode("utf-8") + passed_content.append(line) + except: + print("Page data not utf-8 encoded, skipping", line) + + # Create a dict of the passed pages that will be used in the construction of the index + passed_pages_dict = construct_passed_pages_dict(passed_content) if res.extra_pages: - print("Validating extra pages file") - extra_page_data = [] - with open(res.extra_pages) as fh: - data = fh.read() - for page_str in data.strip().split("\n"): - page_json = validateJSON(page_str) - - if not page_json: - print("Warning: Ignoring invalid extra page\n %s" % page_str) - continue - - extra_page_data.append(page_str.encode("utf-8")) - - extra_pages_file = zipfile.ZipInfo(EXTRA_PAGES_INDEX, now()) - with wacz.open(extra_pages_file, "w") as efh: - efh.write(b"\n".join(extra_page_data)) + if res.copy_pages: + print("Copying passed extraPages.jsonl file to WACZ") + if validate_pages_jsonl_file(res.extra_pages): + with open(res.extra_pages, "rb") as fh: + with wacz.open(extra_pages_jsonl, "w") as extra_pages_file: + shutil.copyfileobj(fh, extra_pages_file) + else: + print("Validating extra pages file") + extra_page_data = [] + with open(res.extra_pages) as fh: + data = fh.read() + for page_str in 
data.strip().split("\n"): + page_json = validateJSON(page_str) + + if not page_json: + print("Warning: Ignoring invalid extra page\n %s" % page_str) + continue + + extra_page_data.append(page_str.encode("utf-8")) + + extra_pages_file = zipfile.ZipInfo(EXTRA_PAGES_INDEX, now()) + with wacz.open(extra_pages_file, "w") as efh: + efh.write(b"\n".join(extra_page_data)) print("Reading and Indexing All WARCs") with wacz.open(data_file, "w") as data: @@ -300,7 +297,7 @@ def create_wacz(res): shutil.copyfileobj(in_fh, out_fh) path = "logs/{}".format(log_file) - if len(wacz_indexer.pages) > 0 and res.pages == None and res.pages_file is None: + if len(wacz_indexer.pages) > 0 and res.pages == None and not res.copy_pages: print("Generating page index...") # generate pages/text wacz_indexer.write_page_list( @@ -314,7 +311,7 @@ def create_wacz(res): ), ) - if len(wacz_indexer.pages) > 0 and res.pages != None and res.pages_file is None: + if len(wacz_indexer.pages) > 0 and res.pages != None and not res.copy_pages: print("Generating page index from passed pages...") # Initially set the default value of the header id and title id_value = "pages" @@ -345,7 +342,7 @@ def create_wacz(res): ), ) - if len(wacz_indexer.extra_pages) > 0 and res.pages_file is None: + if len(wacz_indexer.extra_pages) > 0 and not res.copy_pages: wacz_indexer.write_page_list( wacz, EXTRA_PAGES_INDEX, @@ -357,7 +354,7 @@ def create_wacz(res): ), ) - if len(wacz_indexer.extra_page_lists) > 0 and res.pages_file is None: + if len(wacz_indexer.extra_page_lists) > 0 and not res.copy_pages: print("Generating extra page lists...") for name, pagelist in wacz_indexer.extra_page_lists.items(): diff --git a/wacz/util.py b/wacz/util.py index 8db1cd3..0a5247b 100644 --- a/wacz/util.py +++ b/wacz/util.py @@ -1,4 +1,4 @@ -import hashlib, datetime, json +import hashlib, datetime, json, os from warcio.timeutils import iso_date_to_timestamp import pkg_resources @@ -58,6 +58,13 @@ def hash_stream(hash_type, stream): return size, 
hash_type + ":" + hasher.hexdigest() +def hash_file(type_, filename): + with open(filename, "rb") as fh: + size_, hash_ = hash_stream(type_, fh) + + return hash_ + + def construct_passed_pages_dict(passed_pages_list): """Creates a dictionary of the passed pages with the url as the key or ts/url if ts is present and the title and text as the values if they have been passed""" passed_pages_dict = {} @@ -100,3 +107,32 @@ def validateJSON(jsonData): except ValueError as err: return False return True + + +def validate_pages_jsonl_file(json_file_path): + """Attempt to validate pages.jsonl file""" + filename = os.path.basename(json_file_path) + if not filename.endswith(".jsonl"): + return False + + line_index = 0 + + with open(json_file_path, "r", encoding="utf-8") as jsonl_file: + for line in jsonl_file: + try: + data = json.loads(line) + if line_index == 0: + data["format"] + data["id"] + else: + data["url"] + data["ts"] + line_index += 1 + except json.JSONDecodeError: + print(f"File {filename} is invalid JSONL") + return False + except (KeyError, TypeError): + print(f"File {filename} missing required fields") + return False + + return True