Skip to content

Commit

Permalink
Switch to using -c/--copy-pages flag to affect pages
Browse files Browse the repository at this point in the history
Add tests with new pages file fixtures
  • Loading branch information
tw4l committed Apr 11, 2024
1 parent 9f4a480 commit b4d2383
Show file tree
Hide file tree
Showing 10 changed files with 278 additions and 81 deletions.
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,20 @@ Overrides the pages index generation with the passed jsonl pages.
wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl
```

### --pages-file
### -e --extra-pages

Overrides the pages index generation by copying existing pages.jsonl and/or extraPages.jsonl directly into the WACZ. Incompatible with --detect-pages and -p/--pages options.
Overrides the extra pages index generation with the passed extra jsonl pages.

```
wacz create tests/fixtures/example-collection.warc --pages-file pages/pages.jsonl --pages-file pages/extraPages.jsonl
wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl -e extra_pages.jsonl
```

### -c --copy-pages

Overrides the behavior of --pages and --extra-pages options to copy existing pages.jsonl and/or extraPages.jsonl as-is directly into the WACZ rather than parsing their contents.

```
wacz create tests/fixtures/example-collection.warc --pages pages/pages.jsonl --extra-pages pages/extraPages.jsonl --copy-pages
```

### -t --text
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
!*.warc.gz
!*.wacz
!*.jsonl
!pages/*
4 changes: 4 additions & 0 deletions tests/fixtures/pages/extraPages.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"}
{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"}
{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
2 changes: 2 additions & 0 deletions tests/fixtures/pages/invalid.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{id": "extra-pages", "title": "Extra Pages"}
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
1 change: 1 addition & 0 deletions tests/fixtures/pages/invalid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Not a JSONL file
2 changes: 2 additions & 0 deletions tests/fixtures/pages/pages.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"}
9 changes: 1 addition & 8 deletions tests/test_create_wacz.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
import unittest, os, zipfile, sys, gzip, json, tempfile
from wacz.main import main, now
from unittest.mock import patch
from wacz.util import hash_stream
from wacz.util import hash_file
from frictionless import validate, Report

TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures")


def hash_file(type_, filename):
    """Return the hash that ``hash_stream`` computes for ``filename``.

    The file is opened in binary mode and passed to ``hash_stream``
    together with ``type_``. ``hash_stream`` yields a two-element result;
    only its second element (the hash) is returned to the caller.
    """
    with open(filename, "rb") as stream:
        result = hash_stream(type_, stream)

    # The first element (the size) is not needed by callers of this helper.
    _, digest = result
    return digest


class TestWaczFormat(unittest.TestCase):
def find_resource(self, resource_list, filename):
for file in resource_list:
Expand Down
153 changes: 153 additions & 0 deletions tests/test_optional_flags_wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import os
import zipfile, json, gzip
from wacz.main import main, now
from wacz.util import hash_file
from unittest.mock import patch
import jsonlines

TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures")
PAGES_DIR = os.path.join(TEST_DIR, "pages")


class TestWaczFormat(unittest.TestCase):
Expand Down Expand Up @@ -35,6 +37,95 @@ def test_warc_with_invalid_passed_pages(self):
0,
)

def test_invalid_passed_pages_copy_pages(self):
    """Passing an invalid pages file via -p together with --copy-pages must make `create` return 1"""
    warc_path = os.path.join(TEST_DIR, "example-collection.warc")

    # (output WACZ name, invalid pages fixture): first a malformed JSONL
    # file, then a plain-text file that is not JSONL at all.
    scenarios = (
        ("example-collection-invalid-copy-pages.wacz", "invalid.jsonl"),
        ("example-collection-invalid-copy-pages-txt.wacz", "invalid.txt"),
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        for wacz_name, pages_name in scenarios:
            cli_args = [
                "create",
                "-f",
                warc_path,
                "-o",
                os.path.join(tmpdir, wacz_name),
                "-p",
                os.path.join(PAGES_DIR, pages_name),
                "--copy-pages",
            ]
            exit_code = main(cli_args)
            self.assertEqual(exit_code, 1)

def test_invalid_passed_extra_pages_copy_pages(self):
    """If a user passes an invalid extraPages.jsonl file using -e --copy-pages we still create WACZ without extra pages"""
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(
            tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
        )

        # Valid pages file plus an invalid extra-pages file: creation is
        # expected to succeed (exit code 0).
        self.assertEqual(
            main(
                [
                    "create",
                    "-f",
                    os.path.join(TEST_DIR, "example-collection.warc"),
                    "-o",
                    wacz_path,
                    "-p",
                    os.path.join(PAGES_DIR, "pages.jsonl"),
                    "-e",
                    os.path.join(PAGES_DIR, "invalid.txt"),
                    "--copy-pages",
                ]
            ),
            0,
        )

        # The context manager closes the archive on exit, so no explicit
        # close() call is needed.
        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "wacz_no_extra_pages"))

        # The resulting WACZ must still pass validation.
        self.assertEqual(main(["validate", "-f", wacz_path]), 0)

        # The invalid extra-pages file must not have been copied in.
        # assertNotIn reports the directory listing on failure.
        self.assertNotIn(
            "extraPages.jsonl",
            os.listdir(os.path.join(tmpdir, "wacz_no_extra_pages/pages/")),
        )

@patch("wacz.main.now")
def test_warc_with_pages_flag(self, mock_now):
"""When passing the pages flag with a valid pages.jsonl file a pages/pages.jsonl file should be created"""
Expand Down Expand Up @@ -95,6 +186,68 @@ def test_warc_with_pages_flag(self, mock_now):
self.assertTrue("url" in obj.keys())
self.assertTrue(obj["url"].encode() in cdx_content)

@patch("wacz.main.now")
def test_warc_with_copy_pages(self, mock_now):
    """When passing the pages and extra-pages flags with copy-pages, the files should end up in the WACZ exactly as-is"""
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)

    source_pages = os.path.join(PAGES_DIR, "pages.jsonl")
    source_extra_pages = os.path.join(PAGES_DIR, "extraPages.jsonl")

    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "example-collection-copy-pages.wacz")

        self.assertEqual(
            main(
                [
                    "create",
                    "-f",
                    os.path.join(TEST_DIR, "example-collection.warc"),
                    "-o",
                    wacz_path,
                    "-p",
                    source_pages,
                    "-e",
                    source_extra_pages,
                    "--copy-pages",
                ]
            ),
            0,
        )

        # The context manager closes the archive on exit, so no explicit
        # close() call is needed.
        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_copy_pages"))

        self.assertEqual(main(["validate", "-f", wacz_path]), 0)

        wacz_pages = os.path.join(tmpdir, "unzipped_copy_pages/pages/pages.jsonl")
        wacz_extra_pages = os.path.join(
            tmpdir, "unzipped_copy_pages/pages/extraPages.jsonl"
        )

        extracted_names = os.listdir(
            os.path.join(tmpdir, "unzipped_copy_pages/pages/")
        )
        self.assertIn("pages.jsonl", extracted_names)
        self.assertIn("extraPages.jsonl", extracted_names)

        # Compare each copied file with its source fixture by hash. The
        # previous assertions passed a single argument to assertEqual,
        # called hash_file with two paths, and pointed at non-existent
        # ".json" fixtures, so they could never verify anything.
        # NOTE(review): assumes hash_file(type_, filename) accepts "sha256"
        # as the hash type - confirm against wacz.util.
        self.assertEqual(
            hash_file("sha256", wacz_pages),
            hash_file("sha256", source_pages),
        )
        self.assertEqual(
            hash_file("sha256", wacz_extra_pages),
            hash_file("sha256", source_extra_pages),
        )

@patch("wacz.main.now")
def test_warc_with_detect_pages_flag(self, mock_now):
"""When passing the text index flag pages/pages.jsonl should be generated."""
Expand Down
Loading

0 comments on commit b4d2383

Please sign in to comment.