From bc8571e347fc366b72453d886db56b99748a9ed9 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 12 Jun 2024 07:34:41 -0400 Subject: [PATCH 001/133] Delete prior importer script. --- deepwell/scripts/importer/.gitignore | 2 - deepwell/scripts/importer/README.md | 2 - deepwell/scripts/importer/__init__.py | 4 - deepwell/scripts/importer/constants.py | 4 - deepwell/scripts/importer/counter.py | 10 - deepwell/scripts/importer/generator.py | 314 --------------------- deepwell/scripts/importer/requirements.txt | 2 - deepwell/scripts/importer/scuttle.py | 5 - deepwell/scripts/importer/structures.py | 72 ----- deepwell/scripts/importer/utils.py | 29 -- deepwell/scripts/importer/wikicomma.py | 287 ------------------- deepwell/scripts/wikicomma_import.py | 84 ------ 12 files changed, 815 deletions(-) delete mode 100644 deepwell/scripts/importer/.gitignore delete mode 100644 deepwell/scripts/importer/README.md delete mode 100644 deepwell/scripts/importer/__init__.py delete mode 100644 deepwell/scripts/importer/constants.py delete mode 100644 deepwell/scripts/importer/counter.py delete mode 100644 deepwell/scripts/importer/generator.py delete mode 100644 deepwell/scripts/importer/requirements.txt delete mode 100644 deepwell/scripts/importer/scuttle.py delete mode 100644 deepwell/scripts/importer/structures.py delete mode 100644 deepwell/scripts/importer/utils.py delete mode 100644 deepwell/scripts/importer/wikicomma.py delete mode 100755 deepwell/scripts/wikicomma_import.py diff --git a/deepwell/scripts/importer/.gitignore b/deepwell/scripts/importer/.gitignore deleted file mode 100644 index 43ae0e2a6c..0000000000 --- a/deepwell/scripts/importer/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -__pycache__/ -*.py[cod] diff --git a/deepwell/scripts/importer/README.md b/deepwell/scripts/importer/README.md deleted file mode 100644 index 639c94fe48..0000000000 --- a/deepwell/scripts/importer/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## deepwell-importer -This is a Python framework to generate a SQL file which seeds a DEEPWELL database. The goal is to provide a simple, generic interface which can be used in the implementation of content seeders. 
diff --git a/deepwell/scripts/importer/__init__.py b/deepwell/scripts/importer/__init__.py deleted file mode 100644 index 5dd5e9bbd4..0000000000 --- a/deepwell/scripts/importer/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .generator import generate_seed -from .structures import * -from .scuttle import run_scuttle_import -from .wikicomma import run_wikicomma_import diff --git a/deepwell/scripts/importer/constants.py b/deepwell/scripts/importer/constants.py deleted file mode 100644 index 32bf86745c..0000000000 --- a/deepwell/scripts/importer/constants.py +++ /dev/null @@ -1,4 +0,0 @@ -from datetime import datetime - -ANONYMOUS_USER_ID = 3 -UNKNOWN_CREATION_DATE = datetime.utcfromtimestamp(0) diff --git a/deepwell/scripts/importer/counter.py b/deepwell/scripts/importer/counter.py deleted file mode 100644 index 445fab0904..0000000000 --- a/deepwell/scripts/importer/counter.py +++ /dev/null @@ -1,10 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class IncrementingCounter: - value: int = 0 - - def next(self) -> int: - self.value += 1 - return self.value diff --git a/deepwell/scripts/importer/generator.py b/deepwell/scripts/importer/generator.py deleted file mode 100644 index 4982f46390..0000000000 --- a/deepwell/scripts/importer/generator.py +++ /dev/null @@ -1,314 +0,0 @@ -import hashlib -from binascii import hexlify -from typing import Iterable, Optional, Set, Union - -from .constants import * -from .counter import IncrementingCounter -from .structures import * -from .utils import get_page_category, wikidot_id_or_auto - -import psycopg2 - - -class Generator: - """ - Generates SQL and S3 invocations. - - This produces a SQL file to ingest data into DEEPWELL, as well as a - shell script which invokes the aws utility to upload data to S3. - - The class also tracks the state of all imported Wikidot data, - as encountered. This is necessary to avoid inserting - duplicate data.
- """ - - __slots__ = ( - "sql_buffer", - "sh_buffer", - "cursor", - "s3_bucket", - "page_category_id", - "user_ids", - "user_slugs", - "site_ids", - "site_slugs", - "page_ids", - "page_slugs", - "page_revision_ids", - "page_revision_numbers", - "page_categories", - "file_names", - "blob_hashes", - "text_hashes", - ) - - def __init__(self, sql_buffer, sh_buffer, cursor, s3_bucket, last_page_category_id): - self.sql_buffer = sql_buffer - self.sh_buffer = sh_buffer - self.cursor = cursor - self.s3_bucket = s3_bucket - self.page_category_id = IncrementingCounter(last_page_category_id) - - self.user_ids, self.user_slugs = set(), set() # Set[int], Set[str] - self.site_ids, self.site_slugs = set(), set() # Set[int], Set[str] - self.page_ids, self.page_slugs = set(), set() # Set[int], Set[Tuple[int, str]] - self.page_revision_ids = set() # Set[int] - self.page_revision_numbers = set() # Set[Tuple[int, int]] - self.page_categories = {} # dict[Tuple[int, str], int] - self.file_names = set() # Set[Tuple[int, str]] - self.blob_hashes = {} # dict[bytes, str] - self.text_hashes = set() # Set[bytes] - - self.sql_buffer.write("-- AUTO-GENERATED FILE\n") - self.sh_buffer.write("# AUTO-GENERATED FILE\n") - - def format(self, query: str, parameters=()) -> str: - return self.cursor.mogrify(query, parameters).decode("utf-8") - - def append_sql(self, query: str, parameters=()): - sql_line = self.format(query, parameters) - self.sql_buffer.write(f"{sql_line};\n") - - def section_sql(self, name: str): - self.sql_buffer.write(f"\n\n--\n-- {name}\n--\n\n") - - def append_sh(self, data: bytes, data_hash: bytes): - def bash_escape(d: bytes) -> str: - r"""Bash-escape binary strings. e.g. $'\x00'""" - - inner = "".join(f"\\x{b:02x}" for b in d) - return f"$'{inner}'" - - data_hash_hex = hexlify(data_hash).decode("utf-8") - bucket_path = f"s3://{self.s3_bucket}/{data_hash_hex}" - - self.sh_buffer.write( - 'file="$(mktemp)"\n' - f"printf '%s' {bash_escape(data)} > \"$file\"\n" - f'aws cp "$file" {bucket_path}\n' - f'rm "$file"\n\n' - ) - - return bucket_path - - def section_sh(self, name: str): - self.sh_buffer.write(f"\n\n#\n# {name}\n#\n\n") - - def add_user(self, user: User): - if ( - self.id_exists(self.user_ids, user.wikidot_id) - or user.slug in self.user_slugs - ): - return - - avatar_path = self.add_blob(user.avatar) - - # TODO change over when user table changes, remaining fields - self.append_sql( - "INSERT INTO users (id, slug, username, avatar_path, created_at) VALUES (%s, %s, %s, %s)", - (wikidot_id_or_auto(user), user.slug, user.name, avatar_path, user.created_at), - ) - - self.id_add(self.user_ids, user.wikidot_id) - self.user_slugs.add(user.slug) - - def add_site(self, site: Site): - if ( - self.id_exists(self.site_ids, site.wikidot_id) - or site.slug in self.site_slugs - ): - return - - self.append_sql( - "INSERT INTO site (site_id, name, slug, subtitle, description) VALUES (%s, %s, %s, %s, %s)", - (wikidot_id_or_auto(site), site.name, site.slug, site.subtitle, site.description), - ) - - self.id_add(self.site_ids, site.wikidot_id) - self.site_slugs.add(site.slug) - - def add_page(self, page: Page): - if ( - self.id_exists(self.page_ids, page.wikidot_id) - or (page.site_id, page.slug) in self.page_slugs - ): - return - - page_category_id = self.add_page_category( - page.site_id, get_page_category(page.slug), - ) - - self.append_sql( - "INSERT INTO page (page_id, created_at, updated_at, site_id, page_category_id, slug, discussion_thread_id) VALUES (%s, %s, %s, %s, %s, %s, %s)", - ( - 
wikidot_id_or_auto(page), - page.created_at, - page.updated_at, - page.site_id, - page_category_id, - page.slug, - page.discussion_thread_id, - ), - ) - - self.id_add(self.page_ids, page.wikidot_id) - self.page_slugs.add((page.site_id, page.slug)) - - def add_page_revisions(self, revisions: Iterable[PageRevision]): - for revision in revisions: - self.add_page_revision(revision) - - def add_page_revision(self, revision: PageRevision): - if ( - self.id_exists(self.page_revision_ids, revision.wikidot_id) - or (revision.page_id, revision.revision_number) - in self.page_revision_numbers - ): - return - - if revision.flags == "N" or revision.revision_number == 0: - revision_type = "created" - elif revision.flags == "R": - revision_type = "move" - else: - revision_type = "regular" - - wikitext_hash = self.add_text(revision.wikitext) - compiled_hash = self.add_text(revision.html) - - # TODO per-revision fields? - self.append_sql( - "INSERT INTO page_revision (revision_id, revision_type, revision_number, created_at, page_id, site_id, user_id, wikitext_hash, compiled_hash, compiled_at, compiled_generator, slug, title, tags, comments) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", - ( - wikidot_id_or_auto(revision), - revision_type, - revision.revision_number, - revision.created_at, - revision.page_id, - revision.site_id, - revision.user_id, - wikitext_hash, - compiled_hash, - revision.created_at, - "Imported from Wikidot", - revision.slug, - revision.title, - revision.tags, - revision.comments, - ), - ) - - self.id_add(self.page_revision_ids, revision.wikidot_id) - self.page_revision_numbers.add((revision.page_id, revision.revision_number)) - - def add_page_votes(self, votes: Iterable[PageVote]): - for vote in votes: - self.add_page_vote(vote) - - def add_page_vote(self, vote: PageVote): - self.append_sql( - "INSERT INTO page_vote (created_at, page_id, user_id, value) VALUES (%s, %s, %s, %s)", - (UNKNOWN_CREATION_DATE, vote.page_id, vote.user_id, vote.value), - ) - - def add_page_lock(self, page_id: int, locked: bool = True): - if locked: - self.append_sql( - "INSERT INTO page_lock (created_at, lock_type, page_id, user_id, reason) VALUES (%s, %s, %s, %s, %s)", - ( - UNKNOWN_CREATION_DATE, - "wikidot", - page_id, - ANONYMOUS_USER_ID, - "Imported from Wikidot", - ), - ) - - def add_page_category(self, site_id: int, category_slug: str) -> int: - page_category_id = self.page_categories.get((site_id, category_slug)) - - if page_category_id is None: - page_category_id = self.page_category_id.next() - self.append_sql( - "INSERT INTO page_category (category_id, site_id, slug) VALUES (%s, %s, %s)", - (page_category_id, site_id, category_slug), - ) - - return page_category_id - - def add_file(self, file: File): - if ( - self.id_exists(self.file_ids, file.wikidot_id) - or (file.page_id, file.name) in self.file_names - ): - return - - self.append_sql( - "INSERT INTO file (file_id, created_at, name, page_id) VALUES (%s, %s, %s, %s)", - (wikidot_id_or_auto(file), file.created_at, file.name, file.page_id), - ) - self.file_names.add((file.page_id, file.name)) - - # TODO add forums - - def add_blob(self, data: bytes) -> str: - data_hash = hashlib.sha512(data).digest() - s3_url = self.blob_hashes.get(data_hash) - - if s3_url is None: - s3_url = self.append_sh(data, data_hash) - self.blob_hashes[data_hash] = s3_url - - return s3_url - - def add_text(self, text: str) -> bytes: - text_bytes = text.encode("utf-8") - text_hash = hashlib.sha512(text_bytes).digest() - - if text_hash not in 
self.text_hashes: - self.append_sql( - "INSERT INTO text (hash, contents) VALUES (%s, %s)", (text_hash, text), - ) - self.text_hashes.add(text_hash) - - return text_hash - - def id_exists(self, field: Set[int], id: Optional[int]) -> bool: - if id is None: - return False - - return id in field - - def id_add(self, field: Set[int], id: Optional[int]): - if id is None: - return - - field.add(id) - - -def generate_seed( - runner: callable, - *, - sql_path: str, - sh_path: str, - s3_bucket: str, - postgres_url: str, - last_page_category_id: int = 0, -): - """ - Given a function which takes a Generator, run through whatever backup and add all the relevant information. - The generator will ensure duplicate data is not added. - """ - - with open(sql_path, "w") as sql_file: - with open(sh_path, "w") as sh_file: - with psycopg2.connect(postgres_url) as connection: - with connection.cursor() as cursor: - generator = Generator( - sql_file, - sh_file, - cursor, - s3_bucket, - last_page_category_id, - ) - runner(generator) diff --git a/deepwell/scripts/importer/requirements.txt b/deepwell/scripts/importer/requirements.txt deleted file mode 100644 index 587febffe8..0000000000 --- a/deepwell/scripts/importer/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -psycopg2>=2.9.3 -py7zr>=0.19.0 diff --git a/deepwell/scripts/importer/scuttle.py b/deepwell/scripts/importer/scuttle.py deleted file mode 100644 index 8e6f248b0d..0000000000 --- a/deepwell/scripts/importer/scuttle.py +++ /dev/null @@ -1,5 +0,0 @@ -# TODO - - -def run_scuttle_import(): - raise NotImplementedError diff --git a/deepwell/scripts/importer/structures.py b/deepwell/scripts/importer/structures.py deleted file mode 100644 index 48c0a86ea5..0000000000 --- a/deepwell/scripts/importer/structures.py +++ /dev/null @@ -1,72 +0,0 @@ -from dataclasses import dataclass -from datetime import datetime -from typing import List, Optional - - -@dataclass(frozen=True) -class User: - # None means the ID isn't known, so we should assign our own - wikidot_id: Optional[int] - created_at: datetime - name: str - slug: str - avatar: bytes - - -@dataclass(frozen=True) -class Site: - wikidot_id: Optional[int] - created_at: datetime - name: str - slug: str - subtitle: str - description: str - - -@dataclass(frozen=True) -class Page: - wikidot_id: Optional[int] - created_at: datetime - updated_at: datetime - site_id: int - title: str - slug: str - discussion_thread_id: Optional[int] - - -@dataclass(frozen=True) -class PageRevision: - wikidot_id: Optional[int] - revision_number: int - created_at: datetime - flags: str - page_id: int - site_id: int - user_id: int - wikitext: str - html: str - slug: str - title: str - tags: List[str] - comments: str - - -@dataclass(frozen=True) -class PageVote: - page_id: int - user_id: int - value: int - - -@dataclass(frozen=True) -class File: - wikidot_id: Optional[int] - page_id: int - name: str - mime: str - size: int - user_id: int - created_at: datetime - - -# TODO forums diff --git a/deepwell/scripts/importer/utils.py b/deepwell/scripts/importer/utils.py deleted file mode 100644 index 17999a981b..0000000000 --- a/deepwell/scripts/importer/utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from psycopg2.extensions import register_adapter, AsIs - - -def get_page_category(page_slug): - parts = page_slug.split(":") - if len(parts) == 1: - return "_default" - - return parts[0] - - -class SqlRaw: - __slots__ = ("value",) - - def __init__(self, value: str): - self.value = value - - def adapt(self): - return AsIs(self.value) - - -def 
wikidot_id_or_auto(item): - if item.wikidot_id is None: - return SqlRaw("DEFAULT") - else: - return item.wikidot_id - - -register_adapter(SqlRaw, SqlRaw.adapt) diff --git a/deepwell/scripts/importer/wikicomma.py b/deepwell/scripts/importer/wikicomma.py deleted file mode 100644 index 5a417dd037..0000000000 --- a/deepwell/scripts/importer/wikicomma.py +++ /dev/null @@ -1,287 +0,0 @@ -import json -import logging -import os -import re -from datetime import datetime - -from .constants import UNKNOWN_CREATION_DATE -from .generator import generate_seed -from .structures import * - -from py7zr import SevenZipFile - -REVISION_FILENAME_REGEX = re.compile(r"(\d+)\.txt") - -logger = logging.getLogger(__name__) - - -class WikicommaImporter: - __slots__ = ( - "generator", - "directory", - "replace_colon", - ) - - def __init__(self, generator, directory, replace_colon=True): - self.generator = generator - self.directory = directory - self.replace_colon = replace_colon - - def process_all(self): - logger.info("Processing all sites") - self.generator.section_sql("Wikicomma") - self.generator.section_sh("Files") - - for site_slug in os.listdir(self.directory): - self.process_site(site_slug) - - def process_site(self, site_slug): - logger.info("Processing site %s", site_slug) - self.generator.section_sql(f"Site: {site_slug}") - - # Add site - unknown_description = f"[NEEDS UPDATE] {site_slug}" - self.generator.add_site( - Site( - wikidot_id=None, - created_at=UNKNOWN_CREATION_DATE, - name=unknown_description, - slug=site_slug, - subtitle=unknown_description, - description=unknown_description, - ) - ) - - # Process site internals - site_directory = os.path.join(self.directory, site_slug) - self.process_site_pages(site_slug, site_directory) - self.process_site_forum(site_slug, site_directory) - - def process_site_pages(self, site_slug: str, site_directory: str): - page_mapping = self.read_json(site_directory, "meta", "page_id_map.json") - file_mapping = self.read_json(site_directory, "meta", "file_map.json") - logger.info("Processing %d pages", len(page_mapping)) - - def get_first_last_revisions(revisions: List[dict]): - # Since the revision list isn't always in order... 
- start_revision = revisions[0] - last_revision = revisions[0] - - for revision in revisions: - if revision["revision"] < start_revision["revision"]: - start_revision = revision - - if revision["revision"] > last_revision["revision"]: - last_revision = revision - - return start_revision, last_revision - - for page_id, page_slug in page_mapping.items(): - self.generator.section_sql(f"Page: {page_slug}") - page_id = int(page_id) - metadata = self.read_page_metadata(site_directory, page_slug) - start_revision, last_revision = get_first_last_revisions( - metadata["revisions"] - ) - created_at = datetime.fromtimestamp(start_revision["stamp"]) - updated_at = datetime.fromtimestamp(last_revision["stamp"]) - site_id = -1 # TODO unknown - - self.generator.add_page( - Page( - wikidot_id=page_id, - created_at=created_at, - updated_at=updated_at, - site_id=site_id, - title=metadata.get("title", ""), - slug=page_slug, - discussion_thread_id=None, # TODO unknown - ) - ) - self.generator.add_page_lock(page_id, metadata.get("is_locked", False)) - self.process_page_revisions(site_directory, site_id, metadata) - self.process_page_files( - site_directory, - page_id, - file_mapping, - metadata["files"], - ) - self.process_page_votes(metadata) - - def process_page_revisions(self, site_directory: str, site_id: int, metadata: dict): - page_slug = metadata["name"] - page_id = metadata["page_id"] - # NOTE: We don't know what these are historically, - title = metadata.get("title", "") - tags = metadata.get("tags", []) - logger.info("Processing revisions for page %s (%d)", page_slug, page_id) - - wikitext_mapping = {} - with self.open_page_revisions(site_directory, page_slug) as archive: - for filename, data in archive.readall().items(): - match = REVISION_FILENAME_REGEX.fullmatch(filename) - revision_number = int(match[1]) - wikitext = data.read().decode("utf-8") - - for revision in metadata["revisions"]: - revision_number = revision["revision"] - user_spec = revision["author"] - logger.debug("Processing revision number %d", revision_number) - - # Is user slug, not a user ID - if isinstance(user_spec, str): - # TODO get ID - logger.warn("Skipping revision, unknown user: %s", user_spec) - continue - - wikitext = wikitext_mapping.get(revision_number) - if wikitext is None: - logger.error("No wikitext found for revision number %d", revision_number) - continue - - self.generator.add_page_revision( - PageRevision( - wikidot_id=revision["global_revision"], - revision_number=revision_number, - created_at=datetime.fromtimestamp(revision["stamp"]), - flags=revision["flags"], - page_id=page_id, - site_id=site_id, - user_id=user_spec, - wikitext=wikitext, - slug=page_slug, - title=title, - html="", # TODO not stored - tags=tags, - comments=revision["commentary"], - ) - ) - - def process_page_files( - self, - site_directory: str, - page_id: int, - file_mapping: dict, - metadata_list: list, - ): - logger.info("Processing files for page ID %d", page_id) - - for metadata in metadata_list: - file_id = metadata["file_id"] - logger.debug("Processing file ID %d", file_id) - - user_spec = metadata["author"] - # Is user slug, not a user ID - if isinstance(user_spec, str): - # TODO get ID - logger.warn("Skipping file, unknown user: %s", user_spec) - continue - - file_location = file_mapping[str(file_id)] - file_path = os.path.join(site_directory, "files", file_location["path"]) - if not os.path.exists(file_path): - logger.error("Path %s does not exist", file_path) - continue - - with open(file_path, "rb") as file: - file_data = 
file.read() - - self.generator.add_file( - File( - wikidot_id=metadata["file_id"], - page_id=page_id, - name=metadata["name"], - mime=metadata["mime"], - size=metadata["size_bytes"], - user_id=user_spec, - created_at=datetime.fromtimestamp(metadata["stamp"]), - ) - ) - - def process_page_votes(self, metadata: dict): - logger.info("Processing %d votes", len(metadata["votings"])) - - for (user_spec, value) in metadata["votings"]: - logger.debug("Processing vote by %s", user_spec) - - # Is user slug, not a user ID - if isinstance(user_spec, str): - # TODO get ID - logger.warn("Skipping vote, unknown user: %s", user_spec) - continue - - # Get vote value - if isinstance(value, bool): - value = +1 if value else -1 - - self.generator.add_page_vote( - PageVote( - page_id=metadata["page_id"], - user_id=user_spec, - value=value, - ) - ) - - def process_site_forum(self, site_slug: str, site_directory: str): - logger.info("Processing forum posts for site %s", site_slug) - self.generator.section_sql(f"Forum: {site_slug} [TODO]") - # TODO - - def read_page_metadata(self, site_directory: str, page_slug: str): - page_metadata_filename = f"{page_slug}.json" - - if self.replace_colon: - page_metadata_filename = page_metadata_filename.replace(":", "_") - - page_metadata = self.read_json( - site_directory, - "meta", - "pages", - page_metadata_filename, - ) - - assert page_metadata["name"] == page_slug - return page_metadata - - def open_page_revisions(self, site_directory: str, page_slug: str): - page_revisions_filename = f"{page_slug}.7z" - - if self.replace_colon: - page_revisions_filename = page_revisions_filename.replace(":", "_") - - page_revisions_path = os.path.join( - site_directory, "pages", page_revisions_filename, - ) - return SevenZipFile(page_revisions_path, "r") - - @staticmethod - def read_json(*path_parts): - path = os.path.join(*path_parts) - - with open(path) as file: - return json.load(file) - - -def run_wikicomma_import( - *, - wikicomma_directory: str, - sql_path: str, - sh_path: str, - s3_bucket: str, - postgres_url: str, - last_page_category_id: int = 0, -): - wikicomma_directory = os.path.normpath(wikicomma_directory) - - def runner(generator): - importer = WikicommaImporter(generator, wikicomma_directory) - importer.process_all() - - generate_seed( - runner, - sql_path=sql_path, - sh_path=sh_path, - s3_bucket=s3_bucket, - postgres_url=postgres_url, - last_page_category_id=last_page_category_id, - ) diff --git a/deepwell/scripts/wikicomma_import.py b/deepwell/scripts/wikicomma_import.py deleted file mode 100755 index 8fb095abdd..0000000000 --- a/deepwell/scripts/wikicomma_import.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import logging -import sys -from importer import run_wikicomma_import - -LOG_FORMAT = "[%(levelname)s] %(asctime)s %(name)s: %(message)s" -LOG_DATE_FORMAT = "[%Y/%m/%d %H:%M:%S]" - -if __name__ == "__main__": - argparser = argparse.ArgumentParser(description="WikiComma importer") - argparser.add_argument( - "-q", - "--quiet", - "--no-stdout", - dest="stdout", - action="store_false", - help="Don't output to standard out.", - ) - argparser.add_argument( - "-D", - "--debug", - dest="debug", - action="store_true", - help="Set logging level to debug.", - ) - argparser.add_argument( - "-d", - "--directory", - "--wikicomma-directory", - dest="wikicomma_directory", - required=True, - help="The directory where WikiComma data resides", - ) - argparser.add_argument( - "-o", - "--sql", - "--output-sql", - dest="sql_path", - required=True, - 
help="The location to output the SQL dump to", - ) - argparser.add_argument( - "-s", - "--shell", - "--output-shell", - dest="sh_path", - required=True, - help="The location to output the shell dump to", - ) - argparser.add_argument( - "-b", - "--s3", - "--s3-bucket", - dest="s3_bucket", - required=True, - help="The name of the S3 bucket to use (read-only)", - ) - argparser.add_argument( - "-u", - "--postgres-url", - dest="postgres_url", - required=True, - help="The DEEPWELL database to connect to (read-only)", - ) - args = argparser.parse_args() - - log_fmtr = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT) - log_stdout = logging.StreamHandler(sys.stdout) - log_stdout.setFormatter(log_fmtr) - log_level = logging.DEBUG if args.debug else logging.INFO - - logger = logging.getLogger("importer") - logger.setLevel(level=log_level) - logger.addHandler(log_stdout) - - run_wikicomma_import( - wikicomma_directory=args.wikicomma_directory, - sql_path=args.sql_path, - sh_path=args.sh_path, - s3_bucket=args.s3_bucket, - postgres_url=args.postgres_url, - ) From 553427000ed411db38720a6d99d511f49f0e3581 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 12 Jun 2024 08:10:12 -0400 Subject: [PATCH 002/133] Start new importer module. --- deepwell/importer/__init__.py | 0 deepwell/importer/__main__.py | 71 +++++++++++++++++++++++++++++++++++ deepwell/importer/importer.py | 22 +++++++++++ 3 files changed, 93 insertions(+) create mode 100644 deepwell/importer/__init__.py create mode 100644 deepwell/importer/__main__.py create mode 100644 deepwell/importer/importer.py diff --git a/deepwell/importer/__init__.py b/deepwell/importer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py new file mode 100644 index 0000000000..895cb5c645 --- /dev/null +++ b/deepwell/importer/__main__.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +import argparse +import logging +import os +import sys + +from .importer import Importer + +LOG_FORMAT = "[%(levelname)s] [%(asctime)s] %(message)s" +LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S" + +if __name__ == "__main__": + argparser = argparse.ArgumentParser(description="WikiComma importer") + argparser.add_argument( + "-q", + "--quiet", + "--no-stdout", + dest="stdout", + action="store_false", + help="Don't output to standard out", + ) + argparser.add_argument( + "-D", + "--debug", + dest="debug", + action="store_true", + help="Set logging level to debug", + ) + argparser.add_argument( + "-d", + "--directory", + "--wikicomma-directory", + dest="wikicomma_directory", + required=True, + help="The directory where WikiComma data resides", + ) + argparser.add_argument( + "-o", + "--sqlite", + "--output-sqlite", + dest="sql_path", + required=True, + help="The location to output the SQLite database to", + ) + argparser.add_argument( + "-P", + "--profile", + "--aws-profile", + dest="aws_profile", + required=True, + help="The AWS profile containing the secrets", + ) + args = argparser.parse_args() + + log_fmtr = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT) + log_stdout = logging.StreamHandler(sys.stdout) + log_stdout.setFormatter(log_fmtr) + log_level = logging.DEBUG if args.debug else logging.INFO + + logger = logging.getLogger("importer") + logger.setLevel(level=log_level) + logger.addHandler(log_stdout) + + importer = Importer( + logger=logger, + wikicomma_directory=args.wikicomma_directory, + sqlite_path=args.sqlite_path, + aws_profile=args.aws_profile, + ) + importer.run() diff --git 
a/deepwell/importer/importer.py b/deepwell/importer/importer.py new file mode 100644 index 0000000000..27a14f1ef5 --- /dev/null +++ b/deepwell/importer/importer.py @@ -0,0 +1,22 @@ +import boto3 + +class Importer: + __slots__ = ( + "logger", + "wikicomma_directory", + "sqlite_path", + "aws_profile", + "boto_session", + "s3_client", + ) + + def __init__(self, *, logger, wikicomma_directory, sqlite_path, aws_profile): + self.logger = logger + self.wikicomma_directory = wikicomma_directory + self.sqlite_path = sqlite_path + self.aws_profile = aws_profile + self.boto_session = boto3.Session(profile_name=aws_profile) + self.s3_client = self.boto_session.client("s3") + + def run(self): + ... From 911afe6010353550fa602fa9545bf888126e2ed1 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 12 Jun 2024 08:35:07 -0400 Subject: [PATCH 003/133] Start s3 methods. --- deepwell/importer/__main__.py | 8 ++++++++ deepwell/importer/importer.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index 895cb5c645..11c26584da 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -43,6 +43,14 @@ required=True, help="The location to output the SQLite database to", ) + argparser.add_argument( + "-b", + "--bucket", + "--s3-bucket", + dest="s3_bucket", + required=True, + help="The S3 bucket to store uploaded files in", + ) argparser.add_argument( "-P", "--profile", diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 27a14f1ef5..587d39ca09 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -1,3 +1,5 @@ +import hashlib + import boto3 class Importer: @@ -8,15 +10,45 @@ class Importer: "aws_profile", "boto_session", "s3_client", + "s3_bucket", ) - def __init__(self, *, logger, wikicomma_directory, sqlite_path, aws_profile): + def __init__(self, *, logger, wikicomma_directory, sqlite_path, aws_profile, s3_bucket): self.logger = logger self.wikicomma_directory = wikicomma_directory self.sqlite_path = sqlite_path self.aws_profile = aws_profile self.boto_session = boto3.Session(profile_name=aws_profile) self.s3_client = self.boto_session.client("s3") + self.s3_bucket = s3_bucket + + def s3_object_exists(self, s3_path): + try: + self.s3_client.head_object( + Bucket=self.s3_bucket, + Key=s3_path, + ) + return True + except: + return False + + def upload_file(self, file_path): + with open(path, "rb") as file: + data = file.read() + s3_path = hashlib.sha256(data).hexdigest() + + if not data: + self.logger.debug("Skipping upload of empty S3 object") + elif self.s3_object_exists(s3_path): + self.logger.debug("S3 object %s already exists", s3_path) + else: + self.logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) + self.s3_client.upload_file( + Bucket=self.s3_bucket, + Key=s3_path, + Body=data, + ContentLength=len(data), + ) def run(self): ... From 19dbb7293d8bb96187d5e145feb7a52e0622b18f Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 12 Jun 2024 08:43:46 -0400 Subject: [PATCH 004/133] Run black formatter. 
--- deepwell/importer/importer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 587d39ca09..3b8d99f657 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -2,6 +2,7 @@ import boto3 + class Importer: __slots__ = ( "logger", @@ -13,7 +14,15 @@ class Importer: "s3_bucket", ) - def __init__(self, *, logger, wikicomma_directory, sqlite_path, aws_profile, s3_bucket): + def __init__( + self, + *, + logger, + wikicomma_directory, + sqlite_path, + aws_profile, + s3_bucket, + ): self.logger = logger self.wikicomma_directory = wikicomma_directory self.sqlite_path = sqlite_path From 56b8b16f7982dd3cf2ec4f4b52b405f93957e24d Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 12 Jun 2024 08:52:35 -0400 Subject: [PATCH 005/133] Start SQLite3 connection file. --- deepwell/importer/database.py | 18 ++++++++++++++++++ deepwell/importer/importer.py | 9 +++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 deepwell/importer/database.py diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py new file mode 100644 index 0000000000..c639e9a3af --- /dev/null +++ b/deepwell/importer/database.py @@ -0,0 +1,18 @@ +import os +import sqlite3 + + +class Database: + __slots__ = ("conn",) + + def __init__(self, db_url): + self.conn = sqlite3.connect(db_url) + + def seed(self): + seed_path = os.path.join(os.path.dirname(__file__), "seed.sql") + + with open(seed_path) as file: + self.conn.executescript(file.read()) + + def close(self): + self.conn.close() diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 3b8d99f657..751b7b4a45 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -1,5 +1,7 @@ import hashlib +from .database import Database + import boto3 @@ -7,7 +9,7 @@ class Importer: __slots__ = ( "logger", "wikicomma_directory", - "sqlite_path", + "database", "aws_profile", "boto_session", "s3_client", @@ -25,7 +27,7 @@ def __init__( ): self.logger = logger self.wikicomma_directory = wikicomma_directory - self.sqlite_path = sqlite_path + self.database = Database(sqlite_path) self.aws_profile = aws_profile self.boto_session = boto3.Session(profile_name=aws_profile) self.s3_client = self.boto_session.client("s3") @@ -61,3 +63,6 @@ def upload_file(self, file_path): def run(self): ... + + def close(self): + self.database.close() From b62c732a5589ca179efc77781038258b93589428 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 08:00:02 -0400 Subject: [PATCH 006/133] Start process methods. 
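process_users() scans the dump's _users directory for JSON blocks of user metadata. A standalone sketch of the same scan pattern (the dump path here is hypothetical):

    import glob
    import json

    def iter_user_blocks(directory: str):
        # Yield (path, parsed block) for each user block JSON on disk.
        for path in glob.iglob(f"{directory}/*.json"):
            with open(path) as file:
                yield path, json.load(file)

    for path, block in iter_user_blocks("/data/wikicomma/_users"):
        print(f"{path}: {len(block)} users")

Note that run() seeds the SQLite schema from seed.sql before scanning, so the tables exist by the time any inserts happen.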
--- deepwell/importer/database.py | 6 +++--- deepwell/importer/importer.py | 26 +++++++++++++++++++++----- deepwell/importer/seed.sql | 8 ++++++++ 3 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 deepwell/importer/seed.sql diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index c639e9a3af..f9823111ef 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -5,14 +5,14 @@ class Database: __slots__ = ("conn",) - def __init__(self, db_url): + def __init__(self, db_url: str) -> None: self.conn = sqlite3.connect(db_url) - def seed(self): + def seed(self) -> None: seed_path = os.path.join(os.path.dirname(__file__), "seed.sql") with open(seed_path) as file: self.conn.executescript(file.read()) - def close(self): + def close(self) -> None: self.conn.close() diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 751b7b4a45..11b0cb02ae 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -1,4 +1,6 @@ +import glob import hashlib +import os from .database import Database @@ -24,7 +26,7 @@ def __init__( sqlite_path, aws_profile, s3_bucket, - ): + ) -> None: self.logger = logger self.wikicomma_directory = wikicomma_directory self.database = Database(sqlite_path) @@ -33,7 +35,7 @@ def __init__( self.s3_client = self.boto_session.client("s3") self.s3_bucket = s3_bucket - def s3_object_exists(self, s3_path): + def s3_object_exists(self, s3_path: str) -> bool: try: self.s3_client.head_object( Bucket=self.s3_bucket, @@ -43,7 +45,7 @@ def s3_object_exists(self, s3_path): except: return False - def upload_file(self, file_path): + def upload_file(self, file_path: str) -> None: with open(path, "rb") as file: data = file.read() s3_path = hashlib.sha256(data).hexdigest() @@ -61,8 +63,22 @@ def upload_file(self, file_path): ContentLength=len(data), ) - def run(self): + def data_dir(self, subdirectory: str) -> str: + return os.path.join(self.wikicomma_directory, subdirectory) + + def run(self) -> None: + self.logger.info("Starting Wikicomma importer...") + + self.database.seed() + self.process_users() ... - def close(self): + def close(self) -> None: self.database.close() + + def process_users(self): + self.logger.info("Processing users...") + + directory = self.data_dir("_users") + for path in glob.iglob(f"{directory}/*.json"): + self.logger.debug("+ {path}") diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql new file mode 100644 index 0000000000..6e60dd43ae --- /dev/null +++ b/deepwell/importer/seed.sql @@ -0,0 +1,8 @@ +CREATE TABLE blob ( +); + +CREATE TABLE file ( +); + +CREATE TABLE user ( +); From 12386407f4477746edf9967ae6dfd4f3a30abf8e Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 09:30:33 -0400 Subject: [PATCH 007/133] Add import utility gitignore. --- deepwell/importer/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 deepwell/importer/.gitignore diff --git a/deepwell/importer/.gitignore b/deepwell/importer/.gitignore new file mode 100644 index 0000000000..43ae0e2a6c --- /dev/null +++ b/deepwell/importer/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.py[cod] From 8d6c9159ff557f29e320f1d5d293b2dba9ce7e25 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 09:32:58 -0400 Subject: [PATCH 008/133] Add user ingest method. 
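WikiComma serializes timestamps the JavaScript way, in milliseconds since the epoch, while these SQLite columns hold Unix seconds; fields such as fetched_at and birthday are therefore divided by 1000 on the way in. A quick demonstration of the conversion (the sample value is made up):

    from datetime import datetime, timezone

    def from_js_timestamp(value):
        # Milliseconds since epoch -> whole seconds; None passes through.
        return None if value is None else value // 1000

    millis = 1718236800000  # made-up sample
    print(datetime.fromtimestamp(from_js_timestamp(millis), tz=timezone.utc))
    # -> 2024-06-13 00:00:00+00:00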
--- deepwell/importer/__main__.py | 21 ++++++------- deepwell/importer/database.py | 57 ++++++++++++++++++++++++++++++++++- deepwell/importer/importer.py | 27 +++++++++++------ deepwell/importer/seed.sql | 29 ++++++++++++++++++ deepwell/importer/utils.py | 8 +++++ 5 files changed, 120 insertions(+), 22 deletions(-) create mode 100644 deepwell/importer/utils.py diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index 11c26584da..f3c046b9fe 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -20,13 +20,6 @@ action="store_false", help="Don't output to standard out", ) - argparser.add_argument( - "-D", - "--debug", - dest="debug", - action="store_true", - help="Set logging level to debug", - ) argparser.add_argument( "-d", "--directory", @@ -39,10 +32,16 @@ "-o", "--sqlite", "--output-sqlite", - dest="sql_path", + dest="sqlite_path", required=True, help="The location to output the SQLite database to", ) + argparser.add_argument( + "-D", + "--delete-sqlite", + action="store_true", + help="Delete the output SQLite before starting operations", + ) argparser.add_argument( "-b", "--bucket", @@ -64,16 +63,16 @@ log_fmtr = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT) log_stdout = logging.StreamHandler(sys.stdout) log_stdout.setFormatter(log_fmtr) - log_level = logging.DEBUG if args.debug else logging.INFO logger = logging.getLogger("importer") - logger.setLevel(level=log_level) + logger.setLevel(level=logging.DEBUG) logger.addHandler(log_stdout) importer = Importer( - logger=logger, wikicomma_directory=args.wikicomma_directory, sqlite_path=args.sqlite_path, + delete_sqlite=args.delete_sqlite, + s3_bucket=args.s3_bucket, aws_profile=args.aws_profile, ) importer.run() diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index f9823111ef..8aa991e54f 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -1,11 +1,21 @@ import os +import logging import sqlite3 +from .utils import from_js_timestamp + +logger = logging.getLogger("importer") + class Database: __slots__ = ("conn",) - def __init__(self, db_url: str) -> None: + def __init__(self, db_url: str, delete: bool = False) -> None: + if delete: + if os.path.exists(db_url): + logger.debug("Deleting previous SQLite at %s", db_url) + os.remove(db_url) + self.conn = sqlite3.connect(db_url) def seed(self) -> None: @@ -14,5 +24,50 @@ def seed(self) -> None: with open(seed_path) as file: self.conn.executescript(file.read()) + def add_user_block(self, block: dict) -> None: + logger.debug("Found %d users in block", len(block)) + + with self.conn as cur: + # key is redundant, string of user ID + for data in block.values(): + self.add_user(cur, data) + + def add_user(self, cur, data: dict) -> None: + cur.execute( + """ + INSERT INTO user + ( + user_slug, + user_name, + user_id, + user_since, + account_type, + karma, + fetched_at, + real_name, + gender, + birthday, + location, + website + ) + VALUES + (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + data["username"], # slug (e.g. foo-bar) + data["full_name"], # name (e.g. 
Foo Bar) + data["user_id"], + data["wikidot_user_since"], + data["account_type"], + data["activity"], + data["fetched_at"] // 1000, + data.get("real_name"), + data.get("gender"), + from_js_timestamp(data.get("birthday")), + data.get("location"), + data.get("website"), + ), + ) + def close(self) -> None: self.conn.close() diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 11b0cb02ae..54b9cb8855 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -1,11 +1,15 @@ import glob import hashlib +import json +import logging import os from .database import Database import boto3 +logger = logging.getLogger("importer") + class Importer: __slots__ = ( @@ -21,15 +25,14 @@ class Importer: def __init__( self, *, - logger, wikicomma_directory, sqlite_path, + delete_sqlite, aws_profile, s3_bucket, ) -> None: - self.logger = logger self.wikicomma_directory = wikicomma_directory - self.database = Database(sqlite_path) + self.database = Database(sqlite_path, delete=delete_sqlite) self.aws_profile = aws_profile self.boto_session = boto3.Session(profile_name=aws_profile) self.s3_client = self.boto_session.client("s3") @@ -51,11 +54,11 @@ def upload_file(self, file_path: str) -> None: s3_path = hashlib.sha256(data).hexdigest() if not data: - self.logger.debug("Skipping upload of empty S3 object") + logger.debug("Skipping upload of empty S3 object") elif self.s3_object_exists(s3_path): - self.logger.debug("S3 object %s already exists", s3_path) + logger.debug("S3 object %s already exists", s3_path) else: - self.logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) + logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) self.s3_client.upload_file( Bucket=self.s3_bucket, Key=s3_path, @@ -67,7 +70,7 @@ def data_dir(self, subdirectory: str) -> str: return os.path.join(self.wikicomma_directory, subdirectory) def run(self) -> None: - self.logger.info("Starting Wikicomma importer...") + logger.info("Starting Wikicomma importer...") self.database.seed() self.process_users() @@ -76,9 +79,13 @@ def run(self) -> None: def close(self) -> None: self.database.close() - def process_users(self): - self.logger.info("Processing users...") + def process_users(self) -> None: + logger.info("Processing users...") directory = self.data_dir("_users") for path in glob.iglob(f"{directory}/*.json"): - self.logger.debug("+ {path}") + logger.debug("Reading %s", path) + with open(path) as file: + data = json.load(file) + + self.database.add_user_block(data) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 6e60dd43ae..da2b888bb5 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -1,8 +1,37 @@ CREATE TABLE blob ( + hex_hash TEXT PRIMARY KEY, + length INTEGER NOT NULL +); + +CREATE TABLE site ( + site_slug TEXT PRIMARY KEY +); + +CREATE TABLE page ( + site_slug TEXT NOT NULL REFERENCES site(site_slug), + page_slug TEXT NOT NULL, + + PRIMARY KEY (site_slug, page_slug) ); CREATE TABLE file ( + + site_slug TEXT NOT NULL REFERENCES site(site_slug), + page_slug TEXT NOT NULL, + + FOREIGN KEY (site_slug, page_slug) REFERENCES page(site_slug, page_slug) ); CREATE TABLE user ( + user_slug TEXT PRIMARY KEY, + user_id INTEGER NOT NULL, + user_since INTEGER NOT NULL, + account_type TEXT NOT NULL, + karma INTEGER NOT NULL, + fetched_at INTEGER NOT NULL, + real_name TEXT, + gender TEXT + birthday INTEGER, + location TEXT, + website TEXT ); diff --git a/deepwell/importer/utils.py b/deepwell/importer/utils.py new file mode 
100644 index 0000000000..e6f7bda7cc --- /dev/null +++ b/deepwell/importer/utils.py @@ -0,0 +1,8 @@ +from typing import Optional + + +def from_js_timestamp(value: Optional[int]) -> Optional[int]: + if value is None: + return None + else: + return value // 1000 From 6f56464a8aca20acbef876ae0ba8e42bd9271493 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 10:09:36 -0400 Subject: [PATCH 009/133] Update user ingestion code. --- deepwell/importer/database.py | 13 +++++++++++-- deepwell/importer/importer.py | 7 ++++++- deepwell/importer/seed.sql | 5 +++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 8aa991e54f..479e284f3c 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -24,8 +24,8 @@ def seed(self) -> None: with open(seed_path) as file: self.conn.executescript(file.read()) - def add_user_block(self, block: dict) -> None: - logger.debug("Found %d users in block", len(block)) + def add_user_block(self, block: dict, filename: str) -> None: + logger.info("Found %d users in block '%s'", len(block), filename) with self.conn as cur: # key is redundant, string of user ID @@ -33,6 +33,13 @@ def add_user_block(self, block: dict) -> None: self.add_user(cur, data) def add_user(self, cur, data: dict) -> None: + logger.info( + "Inserting user '%s' (%s, %d)", + data["full_name"], + data["username"], + data["user_id"], + ) + cur.execute( """ INSERT INTO user @@ -52,6 +59,8 @@ def add_user(self, cur, data: dict) -> None: ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT + DO NOTHING """, ( data["username"], # slug (e.g. foo-bar) diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 54b9cb8855..34015a2702 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -88,4 +88,9 @@ def process_users(self) -> None: with open(path) as file: data = json.load(file) - self.database.add_user_block(data) + filename = os.path.basename(path) + if filename == "pending.json": + logger.debug("Skipping pending user list") + continue + + self.database.add_user_block(data, filename) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index da2b888bb5..04362b7efd 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -24,13 +24,14 @@ CREATE TABLE file ( CREATE TABLE user ( user_slug TEXT PRIMARY KEY, - user_id INTEGER NOT NULL, + user_name TEXT NOT NULL, + user_id INTEGER NOT NULL UNIQUE, user_since INTEGER NOT NULL, account_type TEXT NOT NULL, karma INTEGER NOT NULL, fetched_at INTEGER NOT NULL, real_name TEXT, - gender TEXT + gender TEXT, birthday INTEGER, location TEXT, website TEXT From 07f3ea6fab79642982397eea3b8706e581db4a66 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 10:47:49 -0400 Subject: [PATCH 010/133] Start separate classes for S3 and SiteImporter. 
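Uploads in the new S3 class are content-addressed: the object key is the hex SHA-256 of the file's bytes, so identical content is only stored once, and presence is probed with head_object before uploading. A narrower sketch of that probe using botocore's exception type (bucket and key names are placeholders):

    import boto3
    from botocore.exceptions import ClientError

    def object_exists(client, bucket: str, key: str) -> bool:
        # head_object raises ClientError with a 404 error code
        # when the key is absent from the bucket.
        try:
            client.head_object(Bucket=bucket, Key=key)
            return True
        except ClientError as error:
            if error.response["Error"]["Code"] == "404":
                return False
            raise

    # Requires a configured AWS profile; all names are placeholders.
    client = boto3.Session(profile_name="example").client("s3")
    print(object_exists(client, "example-bucket", "0" * 64))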
--- deepwell/importer/__main__.py | 2 +- deepwell/importer/database.py | 2 +- deepwell/importer/importer.py | 76 ++++++++++++----------------------- deepwell/importer/s3.py | 48 ++++++++++++++++++++++ deepwell/importer/site.py | 14 +++++++ 5 files changed, 90 insertions(+), 52 deletions(-) create mode 100644 deepwell/importer/s3.py create mode 100644 deepwell/importer/site.py diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index f3c046b9fe..0dcf8f003c 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -64,7 +64,7 @@ log_stdout = logging.StreamHandler(sys.stdout) log_stdout.setFormatter(log_fmtr) - logger = logging.getLogger("importer") + logger = logging.getLogger(__package__) logger.setLevel(level=logging.DEBUG) logger.addHandler(log_stdout) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 479e284f3c..dda55f6a24 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -4,7 +4,7 @@ from .utils import from_js_timestamp -logger = logging.getLogger("importer") +logger = logging.getLogger(__name__) class Database: diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 34015a2702..b02ca1900c 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -1,14 +1,12 @@ -import glob import hashlib import json import logging import os from .database import Database +from .s3 import S3 -import boto3 - -logger = logging.getLogger("importer") +logger = logging.getLogger(__name__) class Importer: @@ -16,10 +14,7 @@ class Importer: "logger", "wikicomma_directory", "database", - "aws_profile", - "boto_session", - "s3_client", - "s3_bucket", + "s3", ) def __init__( @@ -33,48 +28,14 @@ def __init__( ) -> None: self.wikicomma_directory = wikicomma_directory self.database = Database(sqlite_path, delete=delete_sqlite) - self.aws_profile = aws_profile - self.boto_session = boto3.Session(profile_name=aws_profile) - self.s3_client = self.boto_session.client("s3") - self.s3_bucket = s3_bucket - - def s3_object_exists(self, s3_path: str) -> bool: - try: - self.s3_client.head_object( - Bucket=self.s3_bucket, - Key=s3_path, - ) - return True - except: - return False - - def upload_file(self, file_path: str) -> None: - with open(path, "rb") as file: - data = file.read() - s3_path = hashlib.sha256(data).hexdigest() - - if not data: - logger.debug("Skipping upload of empty S3 object") - elif self.s3_object_exists(s3_path): - logger.debug("S3 object %s already exists", s3_path) - else: - logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) - self.s3_client.upload_file( - Bucket=self.s3_bucket, - Key=s3_path, - Body=data, - ContentLength=len(data), - ) - - def data_dir(self, subdirectory: str) -> str: - return os.path.join(self.wikicomma_directory, subdirectory) + self.s3 = S3(aws_profile=aws_profile, bucket=s3_bucket) def run(self) -> None: logger.info("Starting Wikicomma importer...") self.database.seed() self.process_users() - ... 
+ self.process_sites() def close(self) -> None: self.database.close() @@ -82,15 +43,30 @@ def close(self) -> None: def process_users(self) -> None: logger.info("Processing users...") - directory = self.data_dir("_users") - for path in glob.iglob(f"{directory}/*.json"): + directory = os.path.join(self.wikicomma_directory, "_users") + for filename in os.listdir(directory): + if filename == "pending.json": + logger.debug("Skipping pending user list") + continue + + path = os.path.join(directory, filename) logger.debug("Reading %s", path) with open(path) as file: data = json.load(file) - filename = os.path.basename(path) - if filename == "pending.json": - logger.debug("Skipping pending user list") + self.database.add_user_block(data, filename) + + def process_sites(self) -> None: + logger.info("Processing sites...") + + for site_descr in os.listdir(self.wikicomma_directory): + if site_descr == "_users": + logger.debug("Skipping user list") continue - self.database.add_user_block(data, filename) + # NOTE: site_descr != site_slug + self.process_site(site_descr) + + def process_site(self, site_descr: str) -> None: + logger.info("Processing site '%s'...", site_descr) + directory = os.path.join(self.wikicomma_directory, site_descr) diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py new file mode 100644 index 0000000000..5cb4f21db9 --- /dev/null +++ b/deepwell/importer/s3.py @@ -0,0 +1,48 @@ +import logging + +import boto3 + +logger = logging.getLogger(__name__) + + +class S3: + __slots__ = ( + "aws_profile", + "session", + "client", + "bucket", + ) + + def __init__(self, *, aws_profile, bucket) -> None: + self.aws_profile = aws_profile + self.session = boto3.Session(profile_name=aws_profile) + self.client = self.session.client("s3") + self.bucket = bucket + + def exists(self, s3_path: str) -> bool: + try: + self.s3_client.head_object( + Bucket=self.s3_bucket, + Key=s3_path, + ) + return True + except: + return False + + def upload(self, file_path: str) -> None: + with open(path, "rb") as file: + data = file.read() + s3_path = hashlib.sha256(data).hexdigest() + + if not data: + logger.debug("Skipping upload of empty S3 object") + elif self.exists(s3_path): + logger.debug("S3 object %s already exists", s3_path) + else: + logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) + self.s3_client.upload_file( + Bucket=self.s3_bucket, + Key=s3_path, + Body=data, + ContentLength=len(data), + ) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py new file mode 100644 index 0000000000..5bd9c26d86 --- /dev/null +++ b/deepwell/importer/site.py @@ -0,0 +1,14 @@ +import logging + +logger = logging.getLogger(__name__) + + +class SiteImporter: + __slots__ = ( + "database", + "site_descr", + "site_slug", + ) + + def __init__(self, ...) -> None: + ... From 85e40e9f2b8c1a5398266342ac38a9b724147a3f Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 11:07:29 -0400 Subject: [PATCH 011/133] Add site data ingestion. 
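The Wikicomma config only records each wiki's display name and URL, so the site slug that keys the site table is recovered from the URL's subdomain. A quick check of the regex added here (the URL is illustrative):

    import re

    WIKIDOT_SITE_REGEX = re.compile(r"https?:\/\/([^\.]+)\.wikidot\.com\/?")

    match = WIKIDOT_SITE_REGEX.match("https://scp-wiki.wikidot.com/")
    assert match is not None
    print(match[1])  # -> scp-wiki

URLs that don't fit this shape fail parse_config with a ValueError up front, rather than guessing a slug that every later table row would key on.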
--- deepwell/importer/__main__.py | 13 +++++++++- deepwell/importer/database.py | 34 ++++++++++++++++++++++++ deepwell/importer/importer.py | 6 +++++ deepwell/importer/seed.sql | 4 ++- deepwell/importer/wikicomma_config.py | 37 +++++++++++++++++++++++++++ 5 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 deepwell/importer/wikicomma_config.py diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index 0dcf8f003c..fff7b0c28a 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -6,6 +6,7 @@ import sys from .importer import Importer +from .wikicomma_config import parse_config LOG_FORMAT = "[%(levelname)s] [%(asctime)s] %(message)s" LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S" @@ -20,13 +21,20 @@ action="store_false", help="Don't output to standard out", ) + argparser.add_argument( + "-c", + "--config", + dest="wikicomma_config", + required=True, + help="The configuration JSON that Wikicomma uses", + ) argparser.add_argument( "-d", "--directory", "--wikicomma-directory", dest="wikicomma_directory", required=True, - help="The directory where WikiComma data resides", + help="The directory where Wikicomma data resides", ) argparser.add_argument( "-o", @@ -68,7 +76,10 @@ logger.setLevel(level=logging.DEBUG) logger.addHandler(log_stdout) + wikicomma_config = parse_config(args.wikicomma_config) + importer = Importer( + wikicomma_config=wikicomma_config, wikicomma_directory=args.wikicomma_directory, sqlite_path=args.sqlite_path, delete_sqlite=args.delete_sqlite, diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index dda55f6a24..cc3b7cce03 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -2,6 +2,7 @@ import logging import sqlite3 +from .wikicomma_config import SiteData from .utils import from_js_timestamp logger = logging.getLogger(__name__) @@ -32,6 +33,39 @@ def add_user_block(self, block: dict, filename: str) -> None: for data in block.values(): self.add_user(cur, data) + def add_site(self, data: SiteData) -> None: + logger.info( + "Inserting site '%s' (%s)", + data.descr, + data.slug, + ) + + with self.conn as cur: + cur.execute( + """ + INSERT INTO site + ( + site_slug, + site_descr, + site_url, + ) + VALUES + (?, ?, ?) + ON CONFLICT + DO UPDATE + SET + site_descr = ?, + site_url = ? 
+ """, + ( + data.slug, + data.descr, + data.url, + data.descr, + data.url, + ), + ) + def add_user(self, cur, data: dict) -> None: logger.info( "Inserting user '%s' (%s, %d)", diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index b02ca1900c..5b46ad3624 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -12,6 +12,7 @@ class Importer: __slots__ = ( "logger", + "wikicomma_config", "wikicomma_directory", "database", "s3", @@ -20,12 +21,14 @@ class Importer: def __init__( self, *, + wikicomma_config, wikicomma_directory, sqlite_path, delete_sqlite, aws_profile, s3_bucket, ) -> None: + self.wikicomma_config = wikicomma_config self.wikicomma_directory = wikicomma_directory self.database = Database(sqlite_path, delete=delete_sqlite) self.s3 = S3(aws_profile=aws_profile, bucket=s3_bucket) @@ -70,3 +73,6 @@ def process_sites(self) -> None: def process_site(self, site_descr: str) -> None: logger.info("Processing site '%s'...", site_descr) directory = os.path.join(self.wikicomma_directory, site_descr) + + site_data = self.wikicomma_config.sites[site_descr] + self.database.add_site(site_data) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 04362b7efd..85494a8106 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -4,7 +4,9 @@ CREATE TABLE blob ( ); CREATE TABLE site ( - site_slug TEXT PRIMARY KEY + site_slug TEXT PRIMARY KEY, + site_descr TEXT NOT NULL, -- Wikicomma name + site_url TEXT NOT NULL ); CREATE TABLE page ( diff --git a/deepwell/importer/wikicomma_config.py b/deepwell/importer/wikicomma_config.py new file mode 100644 index 0000000000..2eadc155c3 --- /dev/null +++ b/deepwell/importer/wikicomma_config.py @@ -0,0 +1,37 @@ +import json +import logging +import re +from collections import namedtuple + +WIKIDOT_SITE_REGEX = re.compile(r"https?:\/\/([^\.]+)\.wikidot\.com\/?") + +WikicommaConfig = namedtuple("WikicommaConfig", ("sites",)) +SiteData = namedtuple("SiteData", ("descr", "slug", "url")) + +logger = logging.getLogger(__name__) + + +def parse_config(path: str) -> WikicommaConfig: + with open(path) as file: + data = json.load(file) + + sites = {} + logger.info("Found sites:") + for pair in data["wikis"]: + descr = pair["name"] + url = pair["url"] + + match = WIKIDOT_SITE_REGEX.match(url) + if match is None: + logger.error("Cannot parse site URL: %s", url) + raise ValueError(url) + slug = match[1] + logger.info("* %s ('%s')", slug, descr) + + sites[descr] = SiteData( + descr=descr, + slug=slug, + url=url, + ) + + return WikicommaConfig(sites=sites) From d269cf61d68e7def0f0fdd6af9b9dc6cd80def70 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 11:24:39 -0400 Subject: [PATCH 012/133] Start work on SiteImporter class. 
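Wikidot site IDs don't appear anywhere in the dump, so SiteImporter scrapes them from the live site: each rendered page embeds an inline script containing WIKIREQUEST.info.siteId = <n>;. A self-contained sketch of the extraction against inlined sample HTML (the numeric value is illustrative):

    import re

    SITE_ID_REGEX = re.compile(r"WIKIREQUEST\.info\.siteId = (\d+);")

    # Stand-in for the HTML fetched from the live site.
    html = "<script> WIKIREQUEST.info.siteId = 123456; </script>"

    match = SITE_ID_REGEX.search(html)
    if match is None:
        raise ValueError("site ID not found in page")
    print(int(match[1]))  # -> 123456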
--- deepwell/importer/database.py | 19 ++++++++------- deepwell/importer/importer.py | 10 +++++++- deepwell/importer/s3.py | 2 +- deepwell/importer/seed.sql | 3 ++- deepwell/importer/site.py | 45 ++++++++++++++++++++++++++++++++++- 5 files changed, 66 insertions(+), 13 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index cc3b7cce03..7c40b8c1e5 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -33,11 +33,12 @@ def add_user_block(self, block: dict, filename: str) -> None: for data in block.values(): self.add_user(cur, data) - def add_site(self, data: SiteData) -> None: + def add_site(self, *, slug: str, descr: str, url: str, id: int) -> None: logger.info( - "Inserting site '%s' (%s)", - data.descr, - data.slug, + "Inserting site '%s' (%s, %d)", + descr, + slug, + id, ) with self.conn as cur: @@ -58,11 +59,11 @@ def add_site(self, data: SiteData) -> None: site_url = ? """, ( - data.slug, - data.descr, - data.url, - data.descr, - data.url, + slug, + descr, + url, + descr, + url, ), ) diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 5b46ad3624..63e1b2aaf5 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -5,6 +5,7 @@ from .database import Database from .s3 import S3 +from .site import SiteImporter logger = logging.getLogger(__name__) @@ -75,4 +76,11 @@ def process_site(self, site_descr: str) -> None: directory = os.path.join(self.wikicomma_directory, site_descr) site_data = self.wikicomma_config.sites[site_descr] - self.database.add_site(site_data) + site_importer = SiteImporter( + directory=directory, + database=self.database, + site_descr=site_data.descr, + site_slug=site_data.slug, + site_url=site_data.url, + ) + site_importer.run() diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 5cb4f21db9..c21bc12811 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -40,7 +40,7 @@ def upload(self, file_path: str) -> None: logger.debug("S3 object %s already exists", s3_path) else: logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) - self.s3_client.upload_file( + self.client.upload_file( Bucket=self.s3_bucket, Key=s3_path, Body=data, diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 85494a8106..683126d78a 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -6,7 +6,8 @@ CREATE TABLE blob ( CREATE TABLE site ( site_slug TEXT PRIMARY KEY, site_descr TEXT NOT NULL, -- Wikicomma name - site_url TEXT NOT NULL + site_url TEXT NOT NULL, + site_id INTEGER NOT NULL ); CREATE TABLE page ( diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 5bd9c26d86..7f35236c76 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -1,14 +1,57 @@ import logging +import re +from urllib.request import urlopen + +from .database import Database + +SITE_ID_REGEX = re.compile(r"WIKIREQUEST\.info\.siteId = (\d+);") logger = logging.getLogger(__name__) class SiteImporter: __slots__ = ( + "directory", "database", "site_descr", "site_slug", + "site_url", + "site_id", ) - def __init__(self, ...) 
-> None: + def __init__( + self, + *, + directory: str, + database: Database, + site_descr: str, + site_slug: str, + site_url: str, + ) -> None: + self.site_descr = site_descr + self.site_slug = site_slug + self.site_url = site_url + self.site_id = self.get_site_id(site_url) + + @staticmethod + def get_site_id(site_url: str) -> int: + logger.info("Downloading web page %s to scrape site ID", site_url) + + with urlopen(site_url) as file: + html = file.read().decode("utf-8") + + match = SITE_ID_REGEX.find(html) + if match is None: + logger.error("Unable to find site ID in HTML") + raise ValueError(site_url) + + return int(match[1]) + + def run(self) -> None: + self.database.add_site( + slug=self.site_slug, + descr=self.site_descr, + url=self.site_url, + id=self.site_id, + ) ... From 6b104585515399df4b364e20725cb4a3683b917a Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 11:34:12 -0400 Subject: [PATCH 013/133] Start site subdirectories. --- deepwell/importer/site.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 7f35236c76..a65e582e50 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -47,6 +47,22 @@ def get_site_id(site_url: str) -> int: return int(match[1]) + @property + def file_dir(self) -> str: + return os.path.join(self.directory, "files") + + @property + def forum_dir(self) -> str: + return os.path.join(self.directory, "forum") + + @property + def page_dir(self) -> str: + return os.path.join(self.directory, "pages") + + @property + def meta_path(self, path: str) -> str: + return os.path.join(self.directory, "meta", path) + def run(self) -> None: self.database.add_site( slug=self.site_slug, @@ -55,3 +71,9 @@ def run(self) -> None: id=self.site_id, ) ... + + def process_files(self) -> None: + ... + + def process_forum(self) -> None: + ... From 005897debecad7df4cdb20908e42b40654641b10 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 11:47:40 -0400 Subject: [PATCH 014/133] Add process_pages() stub. --- deepwell/importer/site.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index a65e582e50..c9e2746670 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -72,6 +72,9 @@ def run(self) -> None: ) ... + def process_pages(self) -> None: + ... + def process_files(self) -> None: ... From 1cab8795e7a8efae526b116309e3ff20087962fb Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:04:32 -0400 Subject: [PATCH 015/133] Add page ID mapping processing. 
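Wikicomma writes a meta/page_id_map.json file per site, mapping
stringified Wikidot page IDs to page slugs. Its shape is roughly this
(entries invented for illustration):

    {
        "14903406": "main",
        "15077657": "component:theme"
    }

Each entry becomes a row in the local page table. The insert uses
ON CONFLICT DO NOTHING, so re-running the importer over the same dump
is harmless.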
--- deepwell/importer/database.py | 27 +++++++++++++++++++++++++-- deepwell/importer/seed.sql | 5 +++-- deepwell/importer/site.py | 15 +++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 7c40b8c1e5..3975150fa8 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -25,6 +25,9 @@ def seed(self) -> None: with open(seed_path) as file: self.conn.executescript(file.read()) + def close(self) -> None: + self.conn.close() + def add_user_block(self, block: dict, filename: str) -> None: logger.info("Found %d users in block '%s'", len(block), filename) @@ -113,5 +116,25 @@ def add_user(self, cur, data: dict) -> None: ), ) - def close(self) -> None: - self.conn.close() + def add_page(self, cur, *, page_id: int, site_slug: str, page_slug: str) -> None: + logger.info("Inserting page '%s' (%d)", page_slug, page_id) + + cur.execute( + """ + INSERT INTO page + ( + page_id, + site_slug, + page_slug + ) + VALUES + (?, ?, ?) + ON CONFLICT + DO NOTHING + """, + ( + page_id, + site_slug, + page_slug, + ), + ) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 683126d78a..a67b1d41f1 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -11,14 +11,15 @@ CREATE TABLE site ( ); CREATE TABLE page ( + page_id INTEGER PRIMARY KEY, site_slug TEXT NOT NULL REFERENCES site(site_slug), page_slug TEXT NOT NULL, - PRIMARY KEY (site_slug, page_slug) + UNIQUE (site_slug, page_slug) ); CREATE TABLE file ( - + file_id INTEGER PRIMARY KEY, site_slug TEXT NOT NULL REFERENCES site(site_slug), page_slug TEXT NOT NULL, diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index c9e2746670..9d4dfa85d0 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -1,5 +1,6 @@ import logging import re +from typing import Union from urllib.request import urlopen from .database import Database @@ -63,6 +64,10 @@ def page_dir(self) -> str: def meta_path(self, path: str) -> str: return os.path.join(self.directory, "meta", path) + def json(self, path: str) -> Union[list, dict]: + with open(path) as file: + return json.load(file) + def run(self) -> None: self.database.add_site( slug=self.site_slug, @@ -73,8 +78,18 @@ def run(self) -> None: ... def process_pages(self) -> None: + self.process_page_ids() ... + def process_page_ids(self) -> None: + logger.info("Ingesting page ID mappings for site %s", self.site_slug) + mapping = self.json(self.meta_path("page_id_map.json")) + with self.database.conn as cur: + for id_str, page_slug in mapping.items(): + logger.debug("Found page '%s' (%d)", page_slug, id_str) + id = int(id_str) + self.database.add_page(cur, + def process_files(self) -> None: ... From 53f82866b4895cbe613f6eca9e9b8eb6ae6bdc6a Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:27:23 -0400 Subject: [PATCH 016/133] Add page data. --- deepwell/importer/site.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 9d4dfa85d0..b606efe670 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -75,10 +75,13 @@ def run(self) -> None: url=self.site_url, id=self.site_id, ) + self.process_pages() + # TODO ... def process_pages(self) -> None: self.process_page_ids() + # TODO ... 
def process_page_ids(self) -> None: @@ -88,10 +91,12 @@ def process_page_ids(self) -> None: for id_str, page_slug in mapping.items(): logger.debug("Found page '%s' (%d)", page_slug, id_str) id = int(id_str) - self.database.add_page(cur, + self.database.add_page(cur, site_slug=self.site_slug, page_slug=page_slug, page_id=id) def process_files(self) -> None: + # TODO ... def process_forum(self) -> None: + # TODO ... From a6f9b6336e0db56ecfda15989ac119f29b24cfe0 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:35:00 -0400 Subject: [PATCH 017/133] Change logging. --- deepwell/importer/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index fff7b0c28a..b1cc28c651 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -8,7 +8,7 @@ from .importer import Importer from .wikicomma_config import parse_config -LOG_FORMAT = "[%(levelname)s] [%(asctime)s] %(message)s" +LOG_FORMAT = "[%(levelname)s] %(asctime)s %(name)s: %(message)s" LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S" if __name__ == "__main__": From 44708515d901618d7c1613030229856005b34feb Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:35:05 -0400 Subject: [PATCH 018/133] Fix regex execution. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index b606efe670..bef1ec7f2f 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -41,7 +41,7 @@ def get_site_id(site_url: str) -> int: with urlopen(site_url) as file: html = file.read().decode("utf-8") - match = SITE_ID_REGEX.find(html) + match = SITE_ID_REGEX.search(html) if match is None: logger.error("Unable to find site ID in HTML") raise ValueError(site_url) From 3f14825be5eb2b5782e1124ad25590468812d1e2 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:46:10 -0400 Subject: [PATCH 019/133] Fix init. --- deepwell/importer/site.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index bef1ec7f2f..b012c535c1 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -29,6 +29,8 @@ def __init__( site_slug: str, site_url: str, ) -> None: + self.directory = directory + self.database = database self.site_descr = site_descr self.site_slug = site_slug self.site_url = site_url From 846bdae0aa8a924af755fa29809ef503b026ebcc Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:49:47 -0400 Subject: [PATCH 020/133] Run black formatter. --- deepwell/importer/site.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index b012c535c1..96678a1d0b 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -93,7 +93,12 @@ def process_page_ids(self) -> None: for id_str, page_slug in mapping.items(): logger.debug("Found page '%s' (%d)", page_slug, id_str) id = int(id_str) - self.database.add_page(cur, site_slug=self.site_slug, page_slug=page_slug, page_id=id) + self.database.add_page( + cur, + site_slug=self.site_slug, + page_slug=page_slug, + page_id=id, + ) def process_files(self) -> None: # TODO From 6f434595375786160d56aa7577d37fa9f8a5dd18 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:55:33 -0400 Subject: [PATCH 021/133] Fix add_site(). 
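The INSERT statement in add_site() had a trailing comma after site_url,
which is invalid SQL, and it never bound the new site_id column at all.
List all four columns explicitly and add the fourth placeholder so the
column list and the bound values line up again.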
--- deepwell/importer/database.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 3975150fa8..69c8a0eea1 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -50,11 +50,12 @@ def add_site(self, *, slug: str, descr: str, url: str, id: int) -> None: INSERT INTO site ( site_slug, + site_id, site_descr, - site_url, + site_url ) VALUES - (?, ?, ?) + (?, ?, ?, ?) ON CONFLICT DO UPDATE SET @@ -63,6 +64,7 @@ def add_site(self, *, slug: str, descr: str, url: str, id: int) -> None: """, ( slug, + id, descr, url, descr, From 4615453643033598c38f36aa0fdc5963d80c94a2 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 12:55:54 -0400 Subject: [PATCH 022/133] Fix decorators. --- deepwell/importer/site.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 96678a1d0b..7d22db16b8 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -50,19 +50,15 @@ def get_site_id(site_url: str) -> int: return int(match[1]) - @property def file_dir(self) -> str: return os.path.join(self.directory, "files") - @property def forum_dir(self) -> str: return os.path.join(self.directory, "forum") - @property def page_dir(self) -> str: return os.path.join(self.directory, "pages") - @property def meta_path(self, path: str) -> str: return os.path.join(self.directory, "meta", path) From 52c2053476a146b82cf1ad84cd4fb8e14c73435e Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 13:02:03 -0400 Subject: [PATCH 023/133] Add missing import. --- deepwell/importer/site.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 7d22db16b8..5c51982d5e 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -1,4 +1,5 @@ import logging +import os import re from typing import Union from urllib.request import urlopen From afea615a0b6ab52ee6b610e4e45e1936b08bab98 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 13:02:40 -0400 Subject: [PATCH 024/133] Add another missing import. --- deepwell/importer/site.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 5c51982d5e..a22d155a87 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -1,3 +1,4 @@ +import json import logging import os import re From 287a9b93e3dabc382c5406487f6de98eb5c9b040 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 13 Jun 2024 13:03:30 -0400 Subject: [PATCH 025/133] Fix format string. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index a22d155a87..f374a72674 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -89,7 +89,7 @@ def process_page_ids(self) -> None: mapping = self.json(self.meta_path("page_id_map.json")) with self.database.conn as cur: for id_str, page_slug in mapping.items(): - logger.debug("Found page '%s' (%d)", page_slug, id_str) + logger.debug("Found page '%s' (%s)", page_slug, id_str) id = int(id_str) self.database.add_page( cur, From 93e73fcecd40a1ac84a7efa5f17cd06e3e4ec815 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sun, 16 Jun 2024 16:25:45 -0400 Subject: [PATCH 026/133] Add requirements.txt for importer. 
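Only two third-party packages are needed so far: boto3 for the S3
uploads performed in s3.py, and py7zr for reading the 7z archives that
Wikicomma produces for page sources. Install them with, for example:

    pip install -r deepwell/importer/requirements.txt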
--- deepwell/importer/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 deepwell/importer/requirements.txt diff --git a/deepwell/importer/requirements.txt b/deepwell/importer/requirements.txt new file mode 100644 index 0000000000..946f3abb60 --- /dev/null +++ b/deepwell/importer/requirements.txt @@ -0,0 +1,2 @@ +boto3>=1.34.0 +py7zr>=0.21.0 From 8b36ed2fe33554d7df765532a8fe02f4a3809ca7 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 00:47:43 -0400 Subject: [PATCH 027/133] Skip torrent files. --- deepwell/importer/importer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 63e1b2aaf5..17171b63fc 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -67,6 +67,9 @@ def process_sites(self) -> None: if site_descr == "_users": logger.debug("Skipping user list") continue + elif site_descr.endswith(".torrent"): + logger.debug("Skipping torrent file from Wikicomma sync") + continue # NOTE: site_descr != site_slug self.process_site(site_descr) From caca64140edbb4ea8bef1463e4b7152220406386 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 00:56:24 -0400 Subject: [PATCH 028/133] Cache site ID (expensive get). --- deepwell/importer/site.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index f374a72674..90fad93adc 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -2,6 +2,7 @@ import logging import os import re +from functools import cache from typing import Union from urllib.request import urlopen @@ -39,6 +40,7 @@ def __init__( self.site_id = self.get_site_id(site_url) @staticmethod + @cache def get_site_id(site_url: str) -> int: logger.info("Downloading web page %s to scrape site ID", site_url) From 69354621bcb262dd22dedacd90c8a0a070dcd644 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 20:13:42 -0400 Subject: [PATCH 029/133] Fetch site ID from database if present. Avoid web downloads if already done. --- deepwell/importer/site.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 90fad93adc..f9067a3c69 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -39,9 +39,22 @@ def __init__( self.site_url = site_url self.site_id = self.get_site_id(site_url) - @staticmethod @cache - def get_site_id(site_url: str) -> int: + def get_site_id(self, site_url: str) -> int: + with self.database.conn as cur: + result = cur.execute( + """ + SELECT site_id FROM site + WHERE site_url = ? + """, + (site_url,), + ).fetchone() + + if result is not None: + site_id = result[0] + logger.debug("Found site ID for URL %s: %d", site_url, site_id) + return site_id + logger.info("Downloading web page %s to scrape site ID", site_url) with urlopen(site_url) as file: From 1259a042ba683cd83734b51e7a54d52caf0de413 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 20:27:36 -0400 Subject: [PATCH 030/133] Add site to page log. 
--- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 69c8a0eea1..15c8d13e0a 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -119,7 +119,7 @@ def add_user(self, cur, data: dict) -> None: ) def add_page(self, cur, *, page_id: int, site_slug: str, page_slug: str) -> None: - logger.info("Inserting page '%s' (%d)", page_slug, page_id) + logger.info("Inserting into site '%s' page '%s' (%d)", site_slug, page_slug, page_id) cur.execute( """ From a37a740a5e4bd3ebf11197e9231f41a88e6ed956 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 22:42:48 -0400 Subject: [PATCH 031/133] Start implementation for page metadata. --- deepwell/importer/database.py | 80 +++++++++++++++++++++++++++++++++++ deepwell/importer/seed.sql | 28 ++++++++++++ deepwell/importer/site.py | 63 ++++++++++++++++++++++++++- 3 files changed, 170 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 15c8d13e0a..0f3a679410 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -1,4 +1,5 @@ import os +import json import logging import sqlite3 @@ -140,3 +141,82 @@ def add_page(self, cur, *, page_id: int, site_slug: str, page_slug: str) -> None page_slug, ), ) + + def add_page_metadata(self, cur, page_id: int, metadata: dict) -> None: + logger.info("Inserting page metadata for page ID %d", page_id) + + cur.execute( + """ + INSERT INTO page_metadata + ( + page_id, + sitemap_updated_at, + title, + locked, + tags + ) + VALUES + (?, ?, ?, ?, ?) + ON CONFLICT + DO NOTHING + """, + ( + metadata["page_id"], + metadata["sitemap_update"] // 1000, + metadata["title"], + metadata["is_locked"], + json.dumps(metadata["tags"]), + ), + ) + + def add_page_revision(self, cur, page_id: int, data: dict) -> None: + logger.info("Inserting page revision %d for page ID %d", data["revision"], page_id) + + cur.execute( + """ + INSERT INTO page_revision + ( + revision_id, + revision_number, + page_id, + user_id, + created_at, + flags, + comments + ) + """, + ( + data["global_revision"], + data["revision"], + page_id, + data["author"], + data["stamp"], + data["flags"], + data["commentary"], + ), + ) + + def add_page_vote(self, cur, *, page_id: int, user_id: int, vote_value: int) -> None: + logger.info("Inserting page vote for page ID %d / user ID %d (value %d)", page_id, user_id, vote_value) + + cur.execute( + """ + INSERT INTO page_vote + ( + page_id, + user_id, + value + ) + VALUES + (?, ?, ?), + ON CONFLICT + DO UPDATE + SET value = ? 
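The per-page metadata JSON carries everything except the wikitext
itself: sitemap_update (a JavaScript-style millisecond timestamp, hence
the // 1000), the title, the lock flag, the tag list, a revisions array
of per-revision metadata, and a votings array of [user_id, bool] pairs,
for example (IDs invented for illustration):

    "votings": [[648221, true], [769012, false]]

An upvote is stored as +1 and a downvote as -1. The metadata row, its
revisions, and its votes are all written inside a single transaction
per page.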
+ """, + ( + page_id, + user_id, + vote_value, + vote_value, + ), + ) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index a67b1d41f1..6b3750ad85 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -18,6 +18,34 @@ CREATE TABLE page ( UNIQUE (site_slug, page_slug) ); +CREATE TABLE page_metadata ( + page_id INTEGER PRIMARY KEY REFERENCES page(page_id), + sitemap_updated_at INTEGER NOT NULL, + title TEXT NOT NULL, + locked INTEGER NOT NULL CHECK (locked IN (0, 1)), -- boolean + tags TEXT NOT NULL -- JSON +); + +CREATE TABLE page_revision ( + revision_id INTEGER PRIMARY KEY + revision_number INTEGER NOT NULL CHECK (revision_number >= 0), + page_id INTEGER NOT NULL REFERENCES page(page_id), + user_id INTEGER NOT NULL REFERENCES user(user_id), + created_at INTEGER NOT NULL, + flags TEXT NOT NULL, + comments TEXT NOT NULL, + + UNIQUE (page_id, revision_number) +); + +CREATE TABLE page_vote ( + page_id INTEGER REFERENCES page(page_id), + user_id INTEGER REFERENCES user(user_id), + value INTEGER NOT NULL, + + PRIMARY KEY (page_id, user_id) +); + CREATE TABLE file ( file_id INTEGER PRIMARY KEY, site_slug TEXT NOT NULL REFERENCES site(site_slug), diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index f9067a3c69..baa1ee0d97 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -3,7 +3,7 @@ import os import re from functools import cache -from typing import Union +from typing import Tuple, Union from urllib.request import urlopen from .database import Database @@ -21,6 +21,7 @@ class SiteImporter: "site_slug", "site_url", "site_id", + "page_ids", ) def __init__( @@ -38,6 +39,7 @@ def __init__( self.site_slug = site_slug self.site_url = site_url self.site_id = self.get_site_id(site_url) + self.page_ids = {} @cache def get_site_id(self, site_url: str) -> int: @@ -67,6 +69,28 @@ def get_site_id(self, site_url: str) -> int: return int(match[1]) + def get_page_id(self, page_slug: str) -> int: + page_id = self.page_ids.get(page_slug) + if page_id is not None: + return page_id + + with self.database.conn as cur: + result = cur.execute( + """ + SELECT page_id FROM page + WHERE page_slug = ? + AND site_slug = ? + """, + (page_slug, self.site_slug), + ).fetchone() + + if result is not None: + (page_id,) = result + self.page_ids[page_slug] = page_id + return page_id + + raise RuntimeError(f"Cannot find page ID for page '{page_slug}' in site '{self.site_slug}'") + def file_dir(self) -> str: return os.path.join(self.directory, "files") @@ -96,6 +120,7 @@ def run(self) -> None: def process_pages(self) -> None: self.process_page_ids() + self.process_page_metadata() # TODO ... 
@@ -106,6 +131,7 @@ def process_page_ids(self) -> None: for id_str, page_slug in mapping.items(): logger.debug("Found page '%s' (%s)", page_slug, id_str) id = int(id_str) + self.page_ids[page_slug] = id self.database.add_page( cur, site_slug=self.site_slug, @@ -113,6 +139,41 @@ def process_page_ids(self) -> None: page_id=id, ) + def process_page_metadata(self) -> None: + logger.info("Ingesting page revision metadata for site %s", self.site_slug) + meta_directory = self.meta_path("pages") + for path in os.listdir(meta_directory): + logger.debug("Processing page metadata %s", path) + + page_slug, ext = os.path.splitext(path) + assert ext == ".json", "Extension for page metadata not JSON" + + page_id = self.get_page_id(page_slug) + path = os.path.join(meta_directory, page_slug) + metadata = self.json(path) + assert metadata["page_slug"] == page_slug + assert metadata["page_id"] == page_id + + with self.database.conn as cur: + self.database.add_page_metadata( + cur, + page_id, + metadata, + ) + self.process_page_revisions(cur, page_id, metadata["revisions"]) + self.process_page_votes(cur, page_id, metadata["votings"]) + + def process_page_revisions(self, cur, page_id: int, revisions: list[dict]) -> None: + logger.debug("Ingesting page revision metadata for page ID %d", page_id) + for revision in revisions: + self.database.add_page_revision(cur, page_id, revision) + + def process_page_votes(self, cur, page_id: int, votes: list[Tuple[int, int]]) -> None: + logger.debug("Ingesting page votes for page ID %d", page_id) + for user_id, bool_value in votes: + int_value = 1 if bool_value else -1 + self.database.add_page_vote(cur, user_id=user_id, page_id=page_id, value=int_value) + def process_files(self) -> None: # TODO ... From 6f38f811fdd0cff89b0b8b53dce8848b47862a3e Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 22:52:49 -0400 Subject: [PATCH 032/133] Add method to convert page slugs to add colons. --- deepwell/importer/site.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index baa1ee0d97..5d0be1c9e1 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -41,6 +41,16 @@ def __init__( self.site_id = self.get_site_id(site_url) self.page_ids = {} + @staticmethod + def convert_page_slug(page_slug: str) -> str: + if page_slug.startswith("_"): + # a _default category page that starts with an underscore, e.g. _template + return page_slug + + # replace only the first underscore + # the second (if present) is a special page, like _404 + return re.subn("_", ":", page_slug, 1) + @cache def get_site_id(self, site_url: str) -> int: with self.database.conn as cur: @@ -70,6 +80,7 @@ def get_site_id(self, site_url: str) -> int: return int(match[1]) def get_page_id(self, page_slug: str) -> int: + page_slug = self.convert_page_slug(page_slug) page_id = self.page_ids.get(page_slug) if page_id is not None: return page_id @@ -148,6 +159,7 @@ def process_page_metadata(self) -> None: page_slug, ext = os.path.splitext(path) assert ext == ".json", "Extension for page metadata not JSON" + page_slug = self.convert_page_slug(page_slug) page_id = self.get_page_id(page_slug) path = os.path.join(meta_directory, page_slug) metadata = self.json(path) From ba99a8c209149be6060572b4924f261bc7a2c3d4 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 22:59:29 -0400 Subject: [PATCH 033/133] Fix typo. 
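A missing comma after "revision_id INTEGER PRIMARY KEY" in seed.sql
made the whole CREATE TABLE page_revision statement invalid, which only
surfaced when executescript() ran the seed.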
--- deepwell/importer/seed.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 6b3750ad85..4c8ce2d09c 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -27,7 +27,7 @@ CREATE TABLE page_metadata ( ); CREATE TABLE page_revision ( - revision_id INTEGER PRIMARY KEY + revision_id INTEGER PRIMARY KEY, revision_number INTEGER NOT NULL CHECK (revision_number >= 0), page_id INTEGER NOT NULL REFERENCES page(page_id), user_id INTEGER NOT NULL REFERENCES user(user_id), From 849d032f644617762a8e2a3009c3f75f5ac3254b Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 22:59:35 -0400 Subject: [PATCH 034/133] Handle missing tag list. --- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 0f3a679410..e413c8c00a 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -165,7 +165,7 @@ def add_page_metadata(self, cur, page_id: int, metadata: dict) -> None: metadata["sitemap_update"] // 1000, metadata["title"], metadata["is_locked"], - json.dumps(metadata["tags"]), + json.dumps(metadata.get("tags", [])), ), ) From dd7a11a465e2b87cb0a1f259365cc6391a97f963 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 22:59:49 -0400 Subject: [PATCH 035/133] Properly convert page slug. --- deepwell/importer/site.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 5d0be1c9e1..117f16beca 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -49,7 +49,8 @@ def convert_page_slug(page_slug: str) -> str: # replace only the first underscore # the second (if present) is a special page, like _404 - return re.subn("_", ":", page_slug, 1) + converted, _ = re.subn("_", ":", page_slug, 1) + return converted @cache def get_site_id(self, site_url: str) -> int: @@ -80,7 +81,6 @@ def get_site_id(self, site_url: str) -> int: return int(match[1]) def get_page_id(self, page_slug: str) -> int: - page_slug = self.convert_page_slug(page_slug) page_id = self.page_ids.get(page_slug) if page_id is not None: return page_id From 02f72b9921540dff12b1486b43e104f80ccb347f Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 23:00:48 -0400 Subject: [PATCH 036/133] Fix insert query. --- deepwell/importer/database.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index e413c8c00a..621349419a 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -184,6 +184,10 @@ def add_page_revision(self, cur, page_id: int, data: dict) -> None: flags, comments ) + VALUES + (?, ?, ?, ?, ?, ?, ?) + ON CONFLICT + DO NOTHING """, ( data["global_revision"], From 46c8b4f1753bd38d636bd19ff454bd4377ea686b Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 23:01:17 -0400 Subject: [PATCH 037/133] Fix page metadata variables. 
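The metadata loop was joining the directory with the bare slug instead
of the actual filename, and asserting on a "page_slug" key that the
Wikicomma metadata files do not have; the slug is stored under "name".
For reference, a metadata file looks roughly like this (values invented
for illustration):

    {
        "page_id": 14903406,
        "name": "scp-1000",
        "title": "SCP-1000",
        "is_locked": false,
        "sitemap_update": 1718600000000,
        "tags": ["keter", "scp"],
        "revisions": [...],
        "votings": [...]
    }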
--- deepwell/importer/site.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 117f16beca..3a2d94cf0f 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -158,13 +158,14 @@ def process_page_metadata(self) -> None: page_slug, ext = os.path.splitext(path) assert ext == ".json", "Extension for page metadata not JSON" + path = os.path.join(meta_directory, path) page_slug = self.convert_page_slug(page_slug) page_id = self.get_page_id(page_slug) - path = os.path.join(meta_directory, page_slug) + metadata = self.json(path) - assert metadata["page_slug"] == page_slug assert metadata["page_id"] == page_id + assert metadata["name"] == page_slug with self.database.conn as cur: self.database.add_page_metadata( From d5050fc1fd48ef846a9925daf4b39175fbbee581 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 23:22:51 -0400 Subject: [PATCH 038/133] Add kangaroo_twelve() utility function. --- deepwell/importer/requirements.txt | 1 + deepwell/importer/utils.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/deepwell/importer/requirements.txt b/deepwell/importer/requirements.txt index 946f3abb60..5c14644825 100644 --- a/deepwell/importer/requirements.txt +++ b/deepwell/importer/requirements.txt @@ -1,2 +1,3 @@ boto3>=1.34.0 +pycryptodome>=3.20.0 py7zr>=0.21.0 diff --git a/deepwell/importer/utils.py b/deepwell/importer/utils.py index e6f7bda7cc..41d8f53d2a 100644 --- a/deepwell/importer/utils.py +++ b/deepwell/importer/utils.py @@ -1,3 +1,5 @@ +from Crypto.Hash import KangarooTwelve + from typing import Optional @@ -6,3 +8,9 @@ def from_js_timestamp(value: Optional[int]) -> Optional[int]: return None else: return value // 1000 + + +def kangaroo_twelve(input: str) -> str: + data = input.encode("utf-8") + hash = KangarooTwelve.new(custom=data) + return hash.read(26).hex() From 7602cf8d4f814fa94e3dc98037cba18fb133a9d3 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 23:24:14 -0400 Subject: [PATCH 039/133] Add text table. --- deepwell/importer/database.py | 20 ++++++++++++++++++-- deepwell/importer/seed.sql | 5 +++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 621349419a..43a63bb9d9 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -1,10 +1,10 @@ -import os import json import logging +import os import sqlite3 from .wikicomma_config import SiteData -from .utils import from_js_timestamp +from .utils import kangaroo_twelve, from_js_timestamp logger = logging.getLogger(__name__) @@ -37,6 +37,22 @@ def add_user_block(self, block: dict, filename: str) -> None: for data in block.values(): self.add_user(cur, data) + def add_text(self, cur, contents: str) -> str: + logger.debug("Adding text entry (len %d)", len(contents)) + + hex_hash = kangaroo_twelve(contents) + cur.execute( + """ + INSERT INTO text + VALUES (hex_hash, contents) + (?, ?) 
+ ON CONFLICT + DO NOTHING + """, + (hex_hash, contents), + ) + return hex_hash + def add_site(self, *, slug: str, descr: str, url: str, id: int) -> None: logger.info( "Inserting site '%s' (%s, %d)", diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 4c8ce2d09c..060405487e 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -3,6 +3,11 @@ CREATE TABLE blob ( length INTEGER NOT NULL ); +CREATE TABLE text ( + hex_hash TEXT PRIMARY KEY, + contents TEXT NOT NULL +); + CREATE TABLE site ( site_slug TEXT PRIMARY KEY, site_descr TEXT NOT NULL, -- Wikicomma name From fdf9e1b597ca902b645e8c1a23b81f45554edf5a Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 23:27:44 -0400 Subject: [PATCH 040/133] Add wikitext storage to page revisions. Separate table to enable easier intermediate processing. --- deepwell/importer/database.py | 22 +++++++++++++++++++++- deepwell/importer/seed.sql | 5 +++++ deepwell/importer/site.py | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 43a63bb9d9..b80400decd 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -185,7 +185,7 @@ def add_page_metadata(self, cur, page_id: int, metadata: dict) -> None: ), ) - def add_page_revision(self, cur, page_id: int, data: dict) -> None: + def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: logger.info("Inserting page revision %d for page ID %d", data["revision"], page_id) cur.execute( @@ -216,6 +216,26 @@ def add_page_revision(self, cur, page_id: int, data: dict) -> None: ), ) + def add_page_revision_wikitext(self, cur, *, revision_id: int, contents: str) -> None: + logger.debug("Inserting page revision wikitext for %d", revision_id) + + hex_hash = self.add_text(cur, contents) + cur.execute( + """ + INSERT INTO page_revision_wikitext + (revision_id, wikitext_hash) + VALUES (?, ?) + ON CONFLICT + DO UPDATE + SET wikitext_hash = ? 
+ """, + ( + revision_id, + hex_hash, + hex_hash, + ), + ) + def add_page_vote(self, cur, *, page_id: int, user_id: int, vote_value: int) -> None: logger.info("Inserting page vote for page ID %d / user ID %d (value %d)", page_id, user_id, vote_value) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 060405487e..11916fbbb1 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -43,6 +43,11 @@ CREATE TABLE page_revision ( UNIQUE (page_id, revision_number) ); +CREATE TABLE page_revision_wikitext ( + revision_id INTEGER PRIMARY KEY REFERENCES page_revision(revision_id), + wikitext_hash TEXT NOT NULL REFERENCES text(hex_hash) +); + CREATE TABLE page_vote ( page_id INTEGER REFERENCES page(page_id), user_id INTEGER REFERENCES user(user_id), diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 3a2d94cf0f..8126ebf110 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -179,7 +179,7 @@ def process_page_metadata(self) -> None: def process_page_revisions(self, cur, page_id: int, revisions: list[dict]) -> None: logger.debug("Ingesting page revision metadata for page ID %d", page_id) for revision in revisions: - self.database.add_page_revision(cur, page_id, revision) + self.database.add_page_revision_metadata(cur, page_id, revision) def process_page_votes(self, cur, page_id: int, votes: list[Tuple[int, int]]) -> None: logger.debug("Ingesting page votes for page ID %d", page_id) From d5c7a08334718f924e08c7e771b813bafdf09fe2 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Mon, 17 Jun 2024 23:29:57 -0400 Subject: [PATCH 041/133] Add quotes. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 8126ebf110..eaa7699e30 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -154,7 +154,7 @@ def process_page_metadata(self) -> None: logger.info("Ingesting page revision metadata for site %s", self.site_slug) meta_directory = self.meta_path("pages") for path in os.listdir(meta_directory): - logger.debug("Processing page metadata %s", path) + logger.debug("Processing page metadata '%s'", path) page_slug, ext = os.path.splitext(path) assert ext == ".json", "Extension for page metadata not JSON" From 24acb5eeb9ee9e226831f45a660dc85e510a3014 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Tue, 18 Jun 2024 00:11:42 -0400 Subject: [PATCH 042/133] Add page revision wikitext extraction. 
--- deepwell/importer/database.py | 2 +- deepwell/importer/site.py | 64 +++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index b80400decd..aed5dd38d4 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -216,7 +216,7 @@ def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: ), ) - def add_page_revision_wikitext(self, cur, *, revision_id: int, contents: str) -> None: + def add_page_revision_wikitext(self, cur, revision_id: int, contents: str) -> None: logger.debug("Inserting page revision wikitext for %d", revision_id) hex_hash = self.add_text(cur, contents) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index eaa7699e30..53bf9d4c50 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -3,9 +3,12 @@ import os import re from functools import cache +from io import BytesIO from typing import Tuple, Union from urllib.request import urlopen +import py7zr + from .database import Database SITE_ID_REGEX = re.compile(r"WIKIREQUEST\.info\.siteId = (\d+);") @@ -102,6 +105,21 @@ def get_page_id(self, page_slug: str) -> int: raise RuntimeError(f"Cannot find page ID for page '{page_slug}' in site '{self.site_slug}'") + def get_revision_id(self, cur, page_id: int, revision_number: int) -> int: + result = cur.execute( + """ + SELECT revision_id + FROM page_revision + WHERE page_id = ? + AND revision_number = ? + """, + (page_id, revision_number), + ) + if result is None: + raise RuntimeError(f"Cannot find page revision for (page {page_id}, rev {revision_number})") + (revision_id,) = result + return revision_id + def file_dir(self) -> str: return os.path.join(self.directory, "files") @@ -132,6 +150,7 @@ def run(self) -> None: def process_pages(self) -> None: self.process_page_ids() self.process_page_metadata() + self.process_page_wikitext() # TODO ... 
@@ -154,7 +173,7 @@ def process_page_metadata(self) -> None: logger.info("Ingesting page revision metadata for site %s", self.site_slug) meta_directory = self.meta_path("pages") for path in os.listdir(meta_directory): - logger.debug("Processing page metadata '%s'", path) + logger.debug("Processing page metadata from '%s'", path) page_slug, ext = os.path.splitext(path) assert ext == ".json", "Extension for page metadata not JSON" @@ -173,10 +192,10 @@ def process_page_metadata(self) -> None: page_id, metadata, ) - self.process_page_revisions(cur, page_id, metadata["revisions"]) + self.process_page_revisions_metadata(cur, page_id, metadata["revisions"]) self.process_page_votes(cur, page_id, metadata["votings"]) - def process_page_revisions(self, cur, page_id: int, revisions: list[dict]) -> None: + def process_page_revisions_metadata(self, cur, page_id: int, revisions: list[dict]) -> None: logger.debug("Ingesting page revision metadata for page ID %d", page_id) for revision in revisions: self.database.add_page_revision_metadata(cur, page_id, revision) @@ -187,6 +206,45 @@ def process_page_votes(self, cur, page_id: int, votes: list[Tuple[int, int]]) -> int_value = 1 if bool_value else -1 self.database.add_page_vote(cur, user_id=user_id, page_id=page_id, value=int_value) + def process_page_wikitext(self) -> None: + logger.info("Ingesting page wikitext for site %s", self.site_slug) + for path in os.listdir(self.page_dir): + logger.debug("Processing page wikitext from '%s'", path) + + page_slug, ext = os.path.splitext(path) + assert ext == ".7z", "Extension for page wikitexts not 7z" + path = os.path.join(self.page_dir, path) + + page_slug = self.convert_page_slug(page_slug) + page_id = self.get_page_id(page_slug) + + # Extract page sources for each revision + with py7zr.SevenZipFile(path, "r") as archive: + sources = archive.readall() + + # Convert and begin adding to the database + self.process_page_revisions_wikitext(page_id, sources) + + def process_page_revisions_wikitext(self, page_id: int, sources: dict[str, BytesIO]) -> None: + logger.debug("Ingesting %d page revision wikitexts", len(sources)) + + with self.database.conn as cur: + for filename, buf in sources.items(): + # Get revision number from filename + revision_number_str, ext = os.path.splitext(filename) + assert ext == ".txt", "Extension for page revision wikitext not txt" + revision_number = int(revision_number_str) + logger.info("Ingesting page revision %d (%d)", page_id, revision_number) + + # Get revision ID + revision_id = self.get_revision_id(cur, page_id, revision_number) + + # Converting from binary, mostly to ensure it's UTF-8 + contents = buf.read().decode("utf-8") + + # Run ingestion for this revision + self.database.add_page_revision_wikitext(cur, revision_id, contents) + def process_files(self) -> None: # TODO ... From b53dd1eda7de97bef0678b8a8a36cc1dbad13b5a Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Tue, 18 Jun 2024 00:13:15 -0400 Subject: [PATCH 043/133] Change to properties. 
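Restore @property on the three zero-argument path helpers. They lost
the decorator together with meta_path() in the earlier decorator fix,
but meta_path() takes an argument and so can never be a property, while
file_dir, forum_dir, and page_dir can.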
--- deepwell/importer/site.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 53bf9d4c50..d180f2cb67 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -120,12 +120,15 @@ def get_revision_id(self, cur, page_id: int, revision_number: int) -> int: (revision_id,) = result return revision_id + @property def file_dir(self) -> str: return os.path.join(self.directory, "files") + @property def forum_dir(self) -> str: return os.path.join(self.directory, "forum") + @property def page_dir(self) -> str: return os.path.join(self.directory, "pages") From ed23cb97e251f549a8caa0a35255e7bb0e7dd647 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 19 Jun 2024 22:46:01 -0400 Subject: [PATCH 044/133] Fix queries. --- deepwell/importer/database.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index aed5dd38d4..181ad19067 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -44,7 +44,8 @@ def add_text(self, cur, contents: str) -> str: cur.execute( """ INSERT INTO text - VALUES (hex_hash, contents) + (hex_hash, contents) + VALUES (?, ?) ON CONFLICT DO NOTHING @@ -248,7 +249,7 @@ def add_page_vote(self, cur, *, page_id: int, user_id: int, vote_value: int) -> value ) VALUES - (?, ?, ?), + (?, ?, ?) ON CONFLICT DO UPDATE SET value = ? From e1af90f50bae417cc268f27ae7f36c67a1f49b12 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 19 Jun 2024 22:58:09 -0400 Subject: [PATCH 045/133] Fix get_revision_id() return value. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index d180f2cb67..a2fe8456c9 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -114,7 +114,7 @@ def get_revision_id(self, cur, page_id: int, revision_number: int) -> int: AND revision_number = ? """, (page_id, revision_number), - ) + ).fetchone() if result is None: raise RuntimeError(f"Cannot find page revision for (page {page_id}, rev {revision_number})") (revision_id,) = result From 9a650260bd138f7625c323a31672bd9e2774af40 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 19 Jun 2024 23:34:53 -0400 Subject: [PATCH 046/133] Update metadata title retrieval. --- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 181ad19067..09fdc11e02 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -180,7 +180,7 @@ def add_page_metadata(self, cur, page_id: int, metadata: dict) -> None: ( metadata["page_id"], metadata["sitemap_update"] // 1000, - metadata["title"], + metadata.get("title", ""), metadata["is_locked"], json.dumps(metadata.get("tags", [])), ), From f7b9601ffd59a6b970b8eb936395ffaad6b8ccbe Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 00:45:39 -0400 Subject: [PATCH 047/133] Add page_descr column to page_metadata table. 
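Wikicomma derives its on-disk filenames from page slugs by flattening
the category separator, so the page "component:theme" is stored as
"component_theme.json" and "component_theme.7z". That flattening is
ambiguous in reverse, since an underscore can also belong to a _default
category page like _template or a special page like _404. Rather than
guessing, record the filename stem as page_descr next to the metadata
and use it purely as a lookup key. Illustrative pairs:

    slug               descr (filename stem)
    scp-1000           scp-1000
    component:theme    component_theme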
--- deepwell/importer/database.py | 9 ++++++--- deepwell/importer/seed.sql | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 09fdc11e02..bdcc346e14 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -159,26 +159,29 @@ def add_page(self, cur, *, page_id: int, site_slug: str, page_slug: str) -> None ), ) - def add_page_metadata(self, cur, page_id: int, metadata: dict) -> None: - logger.info("Inserting page metadata for page ID %d", page_id) + def add_page_metadata(self, cur, page_descr: str, metadata: dict) -> None: + page_slug = metadata["name"] + logger.info("Inserting page metadata for page '%s'", page_slug) cur.execute( """ INSERT INTO page_metadata ( page_id, + page_descr, sitemap_updated_at, title, locked, tags ) VALUES - (?, ?, ?, ?, ?) + (?, ?, ?, ?, ?, ?) ON CONFLICT DO NOTHING """, ( metadata["page_id"], + page_descr, metadata["sitemap_update"] // 1000, metadata.get("title", ""), metadata["is_locked"], diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 11916fbbb1..8489924ee5 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -25,6 +25,7 @@ CREATE TABLE page ( CREATE TABLE page_metadata ( page_id INTEGER PRIMARY KEY REFERENCES page(page_id), + page_descr TEXT NOT NULL, sitemap_updated_at INTEGER NOT NULL, title TEXT NOT NULL, locked INTEGER NOT NULL CHECK (locked IN (0, 1)), -- boolean From f465ddaf8aee18e23ee66ad0ed1da6c6f614d567 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 00:46:32 -0400 Subject: [PATCH 048/133] Update get_page_id() method. --- deepwell/importer/site.py | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index a2fe8456c9..ebf7682c17 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -24,7 +24,6 @@ class SiteImporter: "site_slug", "site_url", "site_id", - "page_ids", ) def __init__( @@ -42,18 +41,6 @@ def __init__( self.site_slug = site_slug self.site_url = site_url self.site_id = self.get_site_id(site_url) - self.page_ids = {} - - @staticmethod - def convert_page_slug(page_slug: str) -> str: - if page_slug.startswith("_"): - # a _default category page that starts with an underscore, e.g. _template - return page_slug - - # replace only the first underscore - # the second (if present) is a special page, like _404 - converted, _ = re.subn("_", ":", page_slug, 1) - return converted @cache def get_site_id(self, site_url: str) -> int: @@ -83,27 +70,25 @@ def get_site_id(self, site_url: str) -> int: return int(match[1]) - def get_page_id(self, page_slug: str) -> int: - page_id = self.page_ids.get(page_slug) - if page_id is not None: - return page_id - + def get_page_id(self, page_descr: str) -> int: with self.database.conn as cur: result = cur.execute( """ - SELECT page_id FROM page - WHERE page_slug = ? - AND site_slug = ? + SELECT page.page_id + FROM page + JOIN page_metadata + ON page.page_id = page_metadata.page_id + WHERE page_metadata.page_descr = ? + AND page.site_slug = ? 
""", (page_slug, self.site_slug), ).fetchone() - if result is not None: - (page_id,) = result - self.page_ids[page_slug] = page_id - return page_id + if result is None: + raise RuntimeError(f"Cannot find page ID for page '{page_slug}' in site '{self.site_slug}'") - raise RuntimeError(f"Cannot find page ID for page '{page_slug}' in site '{self.site_slug}'") + (page_id,) = result + return page_id def get_revision_id(self, cur, page_id: int, revision_number: int) -> int: result = cur.execute( From 3a6fe295af517f8600aada5303f5c2c3baf8b690 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 00:47:14 -0400 Subject: [PATCH 049/133] Change logic to use page_descr. --- deepwell/importer/site.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index ebf7682c17..840b4758db 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -163,23 +163,17 @@ def process_page_metadata(self) -> None: for path in os.listdir(meta_directory): logger.debug("Processing page metadata from '%s'", path) - page_slug, ext = os.path.splitext(path) + # NOTE: Usually page_slug is the same as page_descr, but if + # there are any colons in it, then they don't match. + # So we can use it as a temporary unique identifier + # but *not* as the slug. + page_descr, ext = os.path.splitext(path) assert ext == ".json", "Extension for page metadata not JSON" path = os.path.join(meta_directory, path) - page_slug = self.convert_page_slug(page_slug) - page_id = self.get_page_id(page_slug) - metadata = self.json(path) - assert metadata["page_id"] == page_id - assert metadata["name"] == page_slug - with self.database.conn as cur: - self.database.add_page_metadata( - cur, - page_id, - metadata, - ) + self.database.add_page_metadata(cur, page_descr, metadata) self.process_page_revisions_metadata(cur, page_id, metadata["revisions"]) self.process_page_votes(cur, page_id, metadata["votings"]) @@ -199,12 +193,11 @@ def process_page_wikitext(self) -> None: for path in os.listdir(self.page_dir): logger.debug("Processing page wikitext from '%s'", path) - page_slug, ext = os.path.splitext(path) + # See above note on page_descr + page_descr, ext = os.path.splitext(path) assert ext == ".7z", "Extension for page wikitexts not 7z" path = os.path.join(self.page_dir, path) - - page_slug = self.convert_page_slug(page_slug) - page_id = self.get_page_id(page_slug) + page_id = self.get_page_id(page_descr) # Extract page sources for each revision with py7zr.SevenZipFile(path, "r") as archive: From f57ab8440b5c0f981f16c4cb228bfcc03fcd1d65 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 02:28:31 -0400 Subject: [PATCH 050/133] Remove deleted page_id cache. --- deepwell/importer/site.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 840b4758db..2f7216c774 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -149,7 +149,6 @@ def process_page_ids(self) -> None: for id_str, page_slug in mapping.items(): logger.debug("Found page '%s' (%s)", page_slug, id_str) id = int(id_str) - self.page_ids[page_slug] = id self.database.add_page( cur, site_slug=self.site_slug, From 8953cb76fd1c57ebc4230e3475d7c05e066e92b7 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 02:39:27 -0400 Subject: [PATCH 051/133] Get page_id for page metadata. 
--- deepwell/importer/site.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 2f7216c774..7d2c4ffa14 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -169,6 +169,7 @@ def process_page_metadata(self) -> None: page_descr, ext = os.path.splitext(path) assert ext == ".json", "Extension for page metadata not JSON" path = os.path.join(meta_directory, path) + page_id = self.get_page_id(page_descr) metadata = self.json(path) with self.database.conn as cur: From 457c1373dbb7c39800dde30dd5357c039d809c3c Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 02:39:52 -0400 Subject: [PATCH 052/133] Fix helper method. --- deepwell/importer/site.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 7d2c4ffa14..e093dd36cb 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -81,11 +81,11 @@ def get_page_id(self, page_descr: str) -> int: WHERE page_metadata.page_descr = ? AND page.site_slug = ? """, - (page_slug, self.site_slug), + (page_descr, self.site_slug), ).fetchone() if result is None: - raise RuntimeError(f"Cannot find page ID for page '{page_slug}' in site '{self.site_slug}'") + raise RuntimeError(f"Cannot find page ID for page descr '{page_descr}' in site '{self.site_slug}'") (page_id,) = result return page_id From 95791a8f3e6c866da1524b039926c37ceaf626a0 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 02:47:07 -0400 Subject: [PATCH 053/133] Get page_id after inserting. --- deepwell/importer/site.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index e093dd36cb..fc8df8a49a 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -169,11 +169,11 @@ def process_page_metadata(self) -> None: page_descr, ext = os.path.splitext(path) assert ext == ".json", "Extension for page metadata not JSON" path = os.path.join(meta_directory, path) - page_id = self.get_page_id(page_descr) metadata = self.json(path) with self.database.conn as cur: self.database.add_page_metadata(cur, page_descr, metadata) + page_id = self.get_page_id(page_descr) self.process_page_revisions_metadata(cur, page_id, metadata["revisions"]) self.process_page_votes(cur, page_id, metadata["votings"]) @@ -197,13 +197,13 @@ def process_page_wikitext(self) -> None: page_descr, ext = os.path.splitext(path) assert ext == ".7z", "Extension for page wikitexts not 7z" path = os.path.join(self.page_dir, path) - page_id = self.get_page_id(page_descr) # Extract page sources for each revision with py7zr.SevenZipFile(path, "r") as archive: sources = archive.readall() # Convert and begin adding to the database + page_id = self.get_page_id(page_descr) self.process_page_revisions_wikitext(page_id, sources) def process_page_revisions_wikitext(self, page_id: int, sources: dict[str, BytesIO]) -> None: From f7424f3ab09492faf16e9b7e7709b50d7834ef89 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 02:56:34 -0400 Subject: [PATCH 054/133] Add log messages. 
--- deepwell/importer/site.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index fc8df8a49a..adfca405ed 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -227,9 +227,11 @@ def process_page_revisions_wikitext(self, page_id: int, sources: dict[str, Bytes self.database.add_page_revision_wikitext(cur, revision_id, contents) def process_files(self) -> None: + logger.info("Ingesting files for site %s", self.site_slug) # TODO ... def process_forum(self) -> None: + logger.info("Ingesting forum data for site %s", self.site_slug) # TODO ... From 9725a6bc667304cb9e8e16b04d59ef819425e9d4 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 02:59:53 -0400 Subject: [PATCH 055/133] Update schema for file table. --- deepwell/importer/seed.sql | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 8489924ee5..efc66e9a1d 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -59,10 +59,8 @@ CREATE TABLE page_vote ( CREATE TABLE file ( file_id INTEGER PRIMARY KEY, - site_slug TEXT NOT NULL REFERENCES site(site_slug), - page_slug TEXT NOT NULL, - - FOREIGN KEY (site_slug, page_slug) REFERENCES page(site_slug, page_slug) + page_id INTEGER NOT NULL REFERENCES page(page_id), + site_slug TEXT NOT NULL REFERENCES site(site_slug) ); CREATE TABLE user ( From 2dd4b0f3c34576a25ded247e673e680a76b85909 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 03:05:55 -0400 Subject: [PATCH 056/133] Add get_page_descr() helper method. --- deepwell/importer/site.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index adfca405ed..c38468e4f0 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -74,7 +74,7 @@ def get_page_id(self, page_descr: str) -> int: with self.database.conn as cur: result = cur.execute( """ - SELECT page.page_id + SELECT page_metadata.page_id FROM page JOIN page_metadata ON page.page_id = page_metadata.page_id @@ -90,6 +90,26 @@ def get_page_id(self, page_descr: str) -> int: (page_id,) = result return page_id + def get_page_descr(self, page_id: int) -> str: + with self.database.conn as cur: + result = cur.execute( + """ + SELECT page_metadata.page_descr + FROM page + JOIN page_metadata + ON page.page_id = page_metadata.page_id + WHERE page_metadata.page_id = ? + AND page.site_slug = ? + """, + (page_id, self.site_slug), + ).fetchone() + + if result is None: + raise RuntimeError(f"Cannot find page descr for page ID {page_id} in site '{self.site_slug}'") + + (page_descr,) = result + return page_descr + def get_revision_id(self, cur, page_id: int, revision_number: int) -> int: result = cur.execute( """ From f7dd20d87deb6ec5e84dac0f91fa90857c88a8e0 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 03:27:00 -0400 Subject: [PATCH 057/133] Start process_files() implementation. 
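Wikicomma also ships a meta/file_map.json per site, mapping stringified
file IDs to the original Wikidot URL and the local path, whose directory
component is the page slug. Roughly (entry invented for illustration):

    {
        "4178151": {
            "url": "http://scp-wiki.wdfiles.com/local--files/scp-1000/bigfoot.jpg",
            "path": "scp-1000/bigfoot.jpg"
        }
    }

For now this just walks the mapping and splits each path into page slug
and filename; actually ingesting the file blobs comes later.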
--- deepwell/importer/site.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index c38468e4f0..94e1c5b006 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -5,6 +5,7 @@ from functools import cache from io import BytesIO from typing import Tuple, Union +from urllib.parse import quote as percent_quote from urllib.request import urlopen import py7zr @@ -248,6 +249,17 @@ def process_page_revisions_wikitext(self, page_id: int, sources: dict[str, Bytes def process_files(self) -> None: logger.info("Ingesting files for site %s", self.site_slug) + + mapping = self.json(self.meta_path("file_map.json")) + for file_id, entry in mapping.items(): + file_id = int(file_id) + wikidot_url = entry["url"] + page_slug_url, filename = os.path.split(entry["path"]) + page_slug = percent_quote(page_slug_url) + logger.debug("Processing file stored at %s", wikidot_url) + + # TODO + # TODO ... From e9278b63ad924e24348f6ab61abc7eed93880cea Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 03:36:33 -0400 Subject: [PATCH 058/133] Write output to log file too. --- deepwell/importer/__main__.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index b1cc28c651..96f20879d6 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -10,6 +10,7 @@ LOG_FORMAT = "[%(levelname)s] %(asctime)s %(name)s: %(message)s" LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S" +LOG_FILENAME = "import.log" if __name__ == "__main__": argparser = argparse.ArgumentParser(description="WikiComma importer") @@ -21,6 +22,12 @@ action="store_false", help="Don't output to standard out", ) + argparser.add_argument( + "--log", + dest="log_file", + default=LOG_FILENAME, + help="The log file to write to", + ) argparser.add_argument( "-c", "--config", @@ -69,12 +76,18 @@ args = argparser.parse_args() log_fmtr = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT) - log_stdout = logging.StreamHandler(sys.stdout) - log_stdout.setFormatter(log_fmtr) + + log_file = logging.FileHandler(filename=LOG_FILENAME, encoding="utf-8", mode=logging.DEBUG) + log_file.setFormatter(log_fmtr) logger = logging.getLogger(__package__) logger.setLevel(level=logging.DEBUG) - logger.addHandler(log_stdout) + logger.addHandler(log_file) + + if not args.quiet: + log_stdout = logging.StreamHandler(sys.stdout) + log_stdout.setFormatter(log_fmtr) + logger.addHandler(log_stdout) wikicomma_config = parse_config(args.wikicomma_config) From 7d803b778a95ce22504f866c5d13ad6acb7e4496 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 14:08:08 -0400 Subject: [PATCH 059/133] Fix log file mode. 
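FileHandler's mode argument is a file-open mode string such as "a" or "w", not a log level; the previous commit passed logging.DEBUG (an int) there by mistake. The corrected construction in isolation, reusing this module's format constants:

    import logging

    handler = logging.FileHandler(filename="import.log", encoding="utf-8", mode="a")
    handler.setFormatter(
        logging.Formatter(
            "[%(levelname)s] %(asctime)s %(name)s: %(message)s",
            datefmt="%Y/%m/%d %H:%M:%S",
        )
    )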
--- deepwell/importer/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index 96f20879d6..1c93910ce6 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -11,6 +11,7 @@ LOG_FORMAT = "[%(levelname)s] %(asctime)s %(name)s: %(message)s" LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S" LOG_FILENAME = "import.log" +LOG_FILE_MODE = "a" if __name__ == "__main__": argparser = argparse.ArgumentParser(description="WikiComma importer") @@ -77,7 +78,7 @@ log_fmtr = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT) - log_file = logging.FileHandler(filename=LOG_FILENAME, encoding="utf-8", mode=logging.DEBUG) + log_file = logging.FileHandler(filename=LOG_FILENAME, encoding="utf-8", mode=LOG_FILE_MODE) log_file.setFormatter(log_fmtr) logger = logging.getLogger(__package__) From beeca1e18a03d78a4cd025eeef94032d735b695e Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 14:15:12 -0400 Subject: [PATCH 060/133] Fix argument processing. --- deepwell/importer/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index 1c93910ce6..cbd9658086 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -85,7 +85,7 @@ logger.setLevel(level=logging.DEBUG) logger.addHandler(log_file) - if not args.quiet: + if args.stdout: log_stdout = logging.StreamHandler(sys.stdout) log_stdout.setFormatter(log_fmtr) logger.addHandler(log_stdout) From acf3bce3e2e219d337626afcdb81b51db35b7dc2 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 14:50:22 -0400 Subject: [PATCH 061/133] Run black formatter. --- deepwell/importer/__main__.py | 6 ++++- deepwell/importer/database.py | 29 ++++++++++++++++++---- deepwell/importer/site.py | 45 ++++++++++++++++++++++++++++------- 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index cbd9658086..db39e7625f 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -78,7 +78,11 @@ log_fmtr = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT) - log_file = logging.FileHandler(filename=LOG_FILENAME, encoding="utf-8", mode=LOG_FILE_MODE) + log_file = logging.FileHandler( + filename=LOG_FILENAME, + encoding="utf-8", + mode=LOG_FILE_MODE, + ) log_file.setFormatter(log_fmtr) logger = logging.getLogger(__package__) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index bdcc346e14..28f8040b36 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -137,7 +137,12 @@ def add_user(self, cur, data: dict) -> None: ) def add_page(self, cur, *, page_id: int, site_slug: str, page_slug: str) -> None: - logger.info("Inserting into site '%s' page '%s' (%d)", site_slug, page_slug, page_id) + logger.info( + "Inserting into site '%s' page '%s' (%d)", + site_slug, + page_slug, + page_id, + ) cur.execute( """ @@ -190,7 +195,11 @@ def add_page_metadata(self, cur, page_descr: str, metadata: dict) -> None: ) def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: - logger.info("Inserting page revision %d for page ID %d", data["revision"], page_id) + logger.info( + "Inserting page revision %d for page ID %d", + data["revision"], + page_id, + ) cur.execute( """ @@ -240,8 +249,20 @@ def add_page_revision_wikitext(self, cur, revision_id: int, contents: str) -> No ), ) - def add_page_vote(self, cur, *, 
page_id: int, user_id: int, vote_value: int) -> None: - logger.info("Inserting page vote for page ID %d / user ID %d (value %d)", page_id, user_id, vote_value) + def add_page_vote( + self, + cur, + *, + page_id: int, + user_id: int, + vote_value: int, + ) -> None: + logger.info( + "Inserting page vote for page ID %d / user ID %d (value %d)", + page_id, + user_id, + vote_value, + ) cur.execute( """ diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 94e1c5b006..9e2b3570e6 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -86,7 +86,9 @@ def get_page_id(self, page_descr: str) -> int: ).fetchone() if result is None: - raise RuntimeError(f"Cannot find page ID for page descr '{page_descr}' in site '{self.site_slug}'") + raise RuntimeError( + f"Cannot find page ID for page descr '{page_descr}' in site '{self.site_slug}'", + ) (page_id,) = result return page_id @@ -106,7 +108,9 @@ def get_page_descr(self, page_id: int) -> str: ).fetchone() if result is None: - raise RuntimeError(f"Cannot find page descr for page ID {page_id} in site '{self.site_slug}'") + raise RuntimeError( + f"Cannot find page descr for page ID {page_id} in site '{self.site_slug}'", + ) (page_descr,) = result return page_descr @@ -122,7 +126,9 @@ def get_revision_id(self, cur, page_id: int, revision_number: int) -> int: (page_id, revision_number), ).fetchone() if result is None: - raise RuntimeError(f"Cannot find page revision for (page {page_id}, rev {revision_number})") + raise RuntimeError( + f"Cannot find page revision for (page {page_id}, rev {revision_number})", + ) (revision_id,) = result return revision_id @@ -195,19 +201,38 @@ def process_page_metadata(self) -> None: with self.database.conn as cur: self.database.add_page_metadata(cur, page_descr, metadata) page_id = self.get_page_id(page_descr) - self.process_page_revisions_metadata(cur, page_id, metadata["revisions"]) + self.process_page_revisions_metadata( + cur, + page_id, + metadata["revisions"], + ) self.process_page_votes(cur, page_id, metadata["votings"]) - def process_page_revisions_metadata(self, cur, page_id: int, revisions: list[dict]) -> None: + def process_page_revisions_metadata( + self, + cur, + page_id: int, + revisions: list[dict], + ) -> None: logger.debug("Ingesting page revision metadata for page ID %d", page_id) for revision in revisions: self.database.add_page_revision_metadata(cur, page_id, revision) - def process_page_votes(self, cur, page_id: int, votes: list[Tuple[int, int]]) -> None: + def process_page_votes( + self, + cur, + page_id: int, + votes: list[Tuple[int, int]], + ) -> None: logger.debug("Ingesting page votes for page ID %d", page_id) for user_id, bool_value in votes: int_value = 1 if bool_value else -1 - self.database.add_page_vote(cur, user_id=user_id, page_id=page_id, value=int_value) + self.database.add_page_vote( + cur, + user_id=user_id, + page_id=page_id, + value=int_value, + ) def process_page_wikitext(self) -> None: logger.info("Ingesting page wikitext for site %s", self.site_slug) @@ -227,7 +252,11 @@ def process_page_wikitext(self) -> None: page_id = self.get_page_id(page_descr) self.process_page_revisions_wikitext(page_id, sources) - def process_page_revisions_wikitext(self, page_id: int, sources: dict[str, BytesIO]) -> None: + def process_page_revisions_wikitext( + self, + page_id: int, + sources: dict[str, BytesIO], + ) -> None: logger.debug("Ingesting %d page revision wikitexts", len(sources)) with self.database.conn as cur: From 74e85e66726723cef22c6eed5e2656f03d123499 Mon 
Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 14:49:28 -0400 Subject: [PATCH 062/133] Remove extra newline. --- deepwell/importer/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepwell/importer/__main__.py b/deepwell/importer/__main__.py index db39e7625f..76d6ff94b6 100644 --- a/deepwell/importer/__main__.py +++ b/deepwell/importer/__main__.py @@ -77,7 +77,6 @@ args = argparser.parse_args() log_fmtr = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT) - log_file = logging.FileHandler( filename=LOG_FILENAME, encoding="utf-8", From 4aaa85cf80baf1c41160076bfd5d79604c1cbffc Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 14:50:38 -0400 Subject: [PATCH 063/133] Unify page table schema. --- deepwell/importer/seed.sql | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index efc66e9a1d..2593c9552f 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -17,19 +17,16 @@ CREATE TABLE site ( CREATE TABLE page ( page_id INTEGER PRIMARY KEY, - site_slug TEXT NOT NULL REFERENCES site(site_slug), - page_slug TEXT NOT NULL, - - UNIQUE (site_slug, page_slug) -); - -CREATE TABLE page_metadata ( - page_id INTEGER PRIMARY KEY REFERENCES page(page_id), page_descr TEXT NOT NULL, + page_slug TEXT NOT NULL, + site_slug TEXT NOT NULL REFERENCES site(site_slug), sitemap_updated_at INTEGER NOT NULL, title TEXT NOT NULL, locked INTEGER NOT NULL CHECK (locked IN (0, 1)), -- boolean - tags TEXT NOT NULL -- JSON + tags TEXT NOT NULL, -- JSON + + UNIQUE (site_slug, page_descr), + UNIQUE (site_slug, page_slug) ); CREATE TABLE page_revision ( From e67b5d2b6d0d9ce11f134d761b8f60f4a59accde Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 15:10:17 -0400 Subject: [PATCH 064/133] Update add_page() method for unified system. --- deepwell/importer/database.py | 37 ++++++++--------------------------- deepwell/importer/site.py | 32 ++++++++++-------------------- 2 files changed, 18 insertions(+), 51 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 28f8040b36..b1786ccdc6 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -136,57 +136,36 @@ def add_user(self, cur, data: dict) -> None: ), ) - def add_page(self, cur, *, page_id: int, site_slug: str, page_slug: str) -> None: + def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> None: logger.info( - "Inserting into site '%s' page '%s' (%d)", + "Inserting into site '%s' page descr '%s'", site_slug, - page_slug, - page_id, + page_descr, ) cur.execute( """ INSERT INTO page - ( - page_id, - site_slug, - page_slug - ) - VALUES - (?, ?, ?) - ON CONFLICT - DO NOTHING - """, - ( - page_id, - site_slug, - page_slug, - ), - ) - - def add_page_metadata(self, cur, page_descr: str, metadata: dict) -> None: - page_slug = metadata["name"] - logger.info("Inserting page metadata for page '%s'", page_slug) - - cur.execute( - """ - INSERT INTO page_metadata ( page_id, page_descr, + page_slug, + site_slug, sitemap_updated_at, title, locked, tags ) VALUES - (?, ?, ?, ?, ?, ?) + (?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT DO NOTHING """, ( metadata["page_id"], page_descr, + metadata["name"], + site_slug, metadata["sitemap_update"] // 1000, metadata.get("title", ""), metadata["is_locked"], diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 9e2b3570e6..020e3b17ba 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -75,12 +75,10 @@ def get_page_id(self, page_descr: str) -> int: with self.database.conn as cur: result = cur.execute( """ - SELECT page_metadata.page_id + SELECT page_id FROM page - JOIN page_metadata - ON page.page_id = page_metadata.page_id - WHERE page_metadata.page_descr = ? - AND page.site_slug = ? + WHERE page_descr = ? + AND site_slug = ? """, (page_descr, self.site_slug), ).fetchone() @@ -163,26 +161,11 @@ def run(self) -> None: ... def process_pages(self) -> None: - self.process_page_ids() self.process_page_metadata() self.process_page_wikitext() # TODO ... - def process_page_ids(self) -> None: - logger.info("Ingesting page ID mappings for site %s", self.site_slug) - mapping = self.json(self.meta_path("page_id_map.json")) - with self.database.conn as cur: - for id_str, page_slug in mapping.items(): - logger.debug("Found page '%s' (%s)", page_slug, id_str) - id = int(id_str) - self.database.add_page( - cur, - site_slug=self.site_slug, - page_slug=page_slug, - page_id=id, - ) - def process_page_metadata(self) -> None: logger.info("Ingesting page revision metadata for site %s", self.site_slug) meta_directory = self.meta_path("pages") @@ -199,8 +182,13 @@ def process_page_metadata(self) -> None: metadata = self.json(path) with self.database.conn as cur: - self.database.add_page_metadata(cur, page_descr, metadata) - page_id = self.get_page_id(page_descr) + self.database.add_page( + cur, + site_slug=self.site_slug, + page_descr=page_descr, + metadata=metadata, + ) + page_id = metadata["page_id"] self.process_page_revisions_metadata( cur, page_id, From 34ab297575b5d63f1331b8504399e594e9bc451e Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 15:59:06 -0400 Subject: [PATCH 065/133] Call all stubs. --- deepwell/importer/site.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 020e3b17ba..2d017cc970 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -157,14 +157,12 @@ def run(self) -> None: id=self.site_id, ) self.process_pages() - # TODO - ... + self.process_files() + self.process_forum() def process_pages(self) -> None: self.process_page_metadata() self.process_page_wikitext() - # TODO - ... def process_page_metadata(self) -> None: logger.info("Ingesting page revision metadata for site %s", self.site_slug) From c3cc44d4e5951772f904f205cd0c24e7420d58fa Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 15:59:51 -0400 Subject: [PATCH 066/133] Return s3_path after upload. 
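The object key is the SHA-256 hex digest of the file contents, so returning it gives callers a stable, content-addressed handle to persist alongside the file row. The hashing step on its own:

    import hashlib

    def s3_key(data: bytes) -> str:
        # Identical bytes always produce the same key, deduplicating uploads.
        return hashlib.sha256(data).hexdigest()

    assert s3_key(b"") == (
        "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
    )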
--- deepwell/importer/s3.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index c21bc12811..1ab364296a 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -29,7 +29,7 @@ def exists(self, s3_path: str) -> bool: except: return False - def upload(self, file_path: str) -> None: + def upload(self, file_path: str) -> str: with open(path, "rb") as file: data = file.read() s3_path = hashlib.sha256(data).hexdigest() @@ -46,3 +46,5 @@ def upload(self, file_path: str) -> None: Body=data, ContentLength=len(data), ) + + return s3_path From 632eb5aa2df829df7e74ee0502129dc39a5b26b1 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:02:18 -0400 Subject: [PATCH 067/133] Add missing import. --- deepwell/importer/s3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 1ab364296a..64c8b4de06 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -1,3 +1,4 @@ +import hashlib import logging import boto3 @@ -32,7 +33,7 @@ def exists(self, s3_path: str) -> bool: def upload(self, file_path: str) -> str: with open(path, "rb") as file: data = file.read() - s3_path = hashlib.sha256(data).hexdigest() + s3_path = hashlib.sha256(data).hexdigest() # files use SHA256, text uses K12 if not data: logger.debug("Skipping upload of empty S3 object") From 2f41c63214b8714262539361b4732e8af4394258 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:07:52 -0400 Subject: [PATCH 068/133] Add s3_hash to file table. --- deepwell/importer/importer.py | 1 + deepwell/importer/seed.sql | 3 ++- deepwell/importer/site.py | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 17171b63fc..dd75ba6baa 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -82,6 +82,7 @@ def process_site(self, site_descr: str) -> None: site_importer = SiteImporter( directory=directory, database=self.database, + s3=self.s3, site_descr=site_data.descr, site_slug=site_data.slug, site_url=site_data.url, diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 2593c9552f..64a82a68fb 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -57,7 +57,8 @@ CREATE TABLE page_vote ( CREATE TABLE file ( file_id INTEGER PRIMARY KEY, page_id INTEGER NOT NULL REFERENCES page(page_id), - site_slug TEXT NOT NULL REFERENCES site(site_slug) + site_slug TEXT NOT NULL REFERENCES site(site_slug), + s3_hash TEXT NOT NULL ); CREATE TABLE user ( diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 2d017cc970..5bf4aa61bc 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -11,6 +11,7 @@ import py7zr from .database import Database +from .s3 import S3 SITE_ID_REGEX = re.compile(r"WIKIREQUEST\.info\.siteId = (\d+);") @@ -21,6 +22,7 @@ class SiteImporter: __slots__ = ( "directory", "database", + "s3", "site_descr", "site_slug", "site_url", @@ -32,12 +34,14 @@ def __init__( *, directory: str, database: Database, + s3: S3, site_descr: str, site_slug: str, site_url: str, ) -> None: self.directory = directory self.database = database + self.s3 = s3 self.site_descr = site_descr self.site_slug = site_slug self.site_url = site_url @@ -273,6 +277,8 @@ def process_files(self) -> None: page_slug = percent_quote(page_slug_url) logger.debug("Processing file stored at %s", wikidot_url) + self. 
+ # TODO # TODO From 42b998d48ae71a732072c45b535e2e986cfdfd44 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:30:21 -0400 Subject: [PATCH 069/133] Move comment placement. --- deepwell/importer/s3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 64c8b4de06..70e69432b5 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -33,7 +33,8 @@ def exists(self, s3_path: str) -> bool: def upload(self, file_path: str) -> str: with open(path, "rb") as file: data = file.read() - s3_path = hashlib.sha256(data).hexdigest() # files use SHA256, text uses K12 + # files use SHA256, text uses K12 + s3_path = hashlib.sha256(data).hexdigest() if not data: logger.debug("Skipping upload of empty S3 object") From 061f0f45063914419fae9f18e7eaa7bad16185b9 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:39:47 -0400 Subject: [PATCH 070/133] Fix runtime issues in s3.py --- deepwell/importer/s3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 70e69432b5..3dfbf6cdc0 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -31,7 +31,7 @@ def exists(self, s3_path: str) -> bool: return False def upload(self, file_path: str) -> str: - with open(path, "rb") as file: + with open(file_path, "rb") as file: data = file.read() # files use SHA256, text uses K12 s3_path = hashlib.sha256(data).hexdigest() @@ -43,7 +43,7 @@ def upload(self, file_path: str) -> str: else: logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) self.client.upload_file( - Bucket=self.s3_bucket, + Bucket=self.bucket, Key=s3_path, Body=data, ContentLength=len(data), From e25308d419f2524f3f73cb6a47002b71959752fd Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:39:58 -0400 Subject: [PATCH 071/133] Add method for adding file row. --- deepwell/importer/database.py | 40 +++++++++++++++++++++++++++++++++++ deepwell/importer/seed.sql | 1 + 2 files changed, 41 insertions(+) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index b1786ccdc6..f51cd62660 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -264,3 +264,43 @@ def add_page_vote( vote_value, ), ) + + def add_file( + self, + cur, + *, + file_id: int, + page_id: int, + site_slug: str, + filename: str, + s3_hash: str, + ) -> None: + logger.info("Inserting file for page ID %d", page_id) + + cur.execute( + """ + INSERT INTO file + ( + file_id, + page_id, + site_slug, + filename, + s3_hash + ) + VALUES + (?, ?, ?, ?) + ON CONFLICT + DO UPDATE + SET filename = ?, + s3_hash = ? + """, + ( + file_id, + page_id, + site_slug, + filename, + s3_hash, + filename, + s3_hash, + ), + ) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 64a82a68fb..d4b19e9adf 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -58,6 +58,7 @@ CREATE TABLE file ( file_id INTEGER PRIMARY KEY, page_id INTEGER NOT NULL REFERENCES page(page_id), site_slug TEXT NOT NULL REFERENCES site(site_slug), + filename TEXT NOT NULL, s3_hash TEXT NOT NULL ); From a5960cc65c22455d4a8117b1f44ead952b75210b Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:40:10 -0400 Subject: [PATCH 072/133] Use match statement for get_page_id() method. 
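Structural pattern matching (Python 3.10+) makes the two lookup modes explicit: exactly one of page_slug or page_descr may be given, and any other combination is rejected up front. The dispatch reduced to its shape:

    def pick_column(*, page_slug: str = None, page_descr: str = None) -> str:
        match bool(page_slug), bool(page_descr):
            case True, False:
                return "page_slug"
            case False, True:
                return "page_descr"
            case _, _:
                raise ValueError("Must pass exactly one parameter")

    assert pick_column(page_descr="some-page") == "page_descr"  # hypothetical descr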
--- deepwell/importer/site.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 5bf4aa61bc..8c93f425f9 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -5,7 +5,7 @@ from functools import cache from io import BytesIO from typing import Tuple, Union -from urllib.parse import quote as percent_quote +from urllib.parse import unquote as percent_unquote from urllib.request import urlopen import py7zr @@ -75,21 +75,33 @@ def get_site_id(self, site_url: str) -> int: return int(match[1]) - def get_page_id(self, page_descr: str) -> int: + def get_page_id(self, *, page_slug: str = None, page_descr: str = None) -> int: with self.database.conn as cur: - result = cur.execute( - """ - SELECT page_id - FROM page - WHERE page_descr = ? - AND site_slug = ? - """, - (page_descr, self.site_slug), - ).fetchone() + match bool(page_slug), bool(page_descr): + case True, False: + query = """ + SELECT page_id + FROM page + WHERE page_slug = ? + AND site_slug = ? + """ + parameters = (page_slug, self.site_slug) + case False, True: + query = """ + SELECT page_id + FROM page + WHERE page_descr = ? + AND site_slug = ? + """ + parameters = (page_descr, self.site_slug) + case _, _: + raise ValueError("Must pass exactly one parameter into get_page_id()") + + result = cur.execute(query, parameters).fetchone() if result is None: raise RuntimeError( - f"Cannot find page ID for page descr '{page_descr}' in site '{self.site_slug}'", + f"Cannot find page ID for page_descr={page_descr} / page_slug={page_slug} in site '{self.site_slug}'", ) (page_id,) = result From f613241c243086048100d405668e9f15481f9444 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:44:55 -0400 Subject: [PATCH 073/133] Implement file uploads. --- deepwell/importer/database.py | 2 +- deepwell/importer/s3.py | 2 +- deepwell/importer/site.py | 33 +++++++++++++++++++++++---------- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index f51cd62660..5a606211c7 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -288,7 +288,7 @@ def add_file( s3_hash ) VALUES - (?, ?, ?, ?) + (?, ?, ?, ?, ?) 
ON CONFLICT DO UPDATE SET filename = ?, diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 3dfbf6cdc0..e07f11017c 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -42,7 +42,7 @@ def upload(self, file_path: str) -> str: logger.debug("S3 object %s already exists", s3_path) else: logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) - self.client.upload_file( + self.client.put_object( Bucket=self.bucket, Key=s3_path, Body=data, diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 8c93f425f9..ee8af262a3 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -95,7 +95,9 @@ def get_page_id(self, *, page_slug: str = None, page_descr: str = None) -> int: """ parameters = (page_descr, self.site_slug) case _, _: - raise ValueError("Must pass exactly one parameter into get_page_id()") + raise ValueError( + "Must pass exactly one parameter into get_page_id()", + ) result = cur.execute(query, parameters).fetchone() @@ -250,8 +252,8 @@ def process_page_wikitext(self) -> None: with py7zr.SevenZipFile(path, "r") as archive: sources = archive.readall() + page_id = self.get_page_id(page_descr=page_descr) # Convert and begin adding to the database - page_id = self.get_page_id(page_descr) self.process_page_revisions_wikitext(page_id, sources) def process_page_revisions_wikitext( @@ -282,16 +284,27 @@ def process_files(self) -> None: logger.info("Ingesting files for site %s", self.site_slug) mapping = self.json(self.meta_path("file_map.json")) - for file_id, entry in mapping.items(): - file_id = int(file_id) - wikidot_url = entry["url"] - page_slug_url, filename = os.path.split(entry["path"]) - page_slug = percent_quote(page_slug_url) - logger.debug("Processing file stored at %s", wikidot_url) + with self.database.conn as cur: + for file_id, entry in mapping.items(): + file_id = int(file_id) + wikidot_url = entry["url"] + logger.debug("Processing file stored at %s", wikidot_url) + + page_slug_url, filename = os.path.split(entry["path"]) + page_slug = percent_unquote(page_slug_url) + page_id = self.get_page_id(page_slug=page_slug) - self. + path = os.path.join(self.file_dir, page_slug_url, str(file_id)) + s3_hash = self.s3.upload(path) - # TODO + self.database.add_file( + cur, + file_id=file_id, + page_id=page_id, + site_slug=self.site_slug, + filename=filename, + s3_hash=s3_hash, + ) # TODO ... From b6d27760e04a3e76b368f0e6b9190d12c248fb0e Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 16:49:05 -0400 Subject: [PATCH 074/133] Remove TODO comment. --- deepwell/importer/site.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index ee8af262a3..aff089f3fc 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -306,9 +306,6 @@ def process_files(self) -> None: s3_hash=s3_hash, ) - # TODO - ... - def process_forum(self) -> None: logger.info("Ingesting forum data for site %s", self.site_slug) # TODO From 990923843bfe151c7d0fa7b9651e99c399cc9c57 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 18:24:05 -0400 Subject: [PATCH 075/133] Add forum tables to schema. 
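The schema mirrors Wikidot's forum hierarchy: categories own threads, threads own posts, and posts own their revisions, each level tied to its parent by a foreign key. A toy sketch of the chain in SQLite (columns trimmed, not the full schema below):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("PRAGMA foreign_keys = ON")
    conn.executescript("""
        CREATE TABLE forum_category (forum_category_id INTEGER PRIMARY KEY);
        CREATE TABLE forum_thread (
            forum_thread_id INTEGER PRIMARY KEY,
            forum_category_id INTEGER NOT NULL
                REFERENCES forum_category(forum_category_id)
        );
        CREATE TABLE forum_post (
            forum_post_id INTEGER PRIMARY KEY,
            forum_thread_id INTEGER NOT NULL
                REFERENCES forum_thread(forum_thread_id)
        );
    """)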
--- deepwell/importer/seed.sql | 71 +++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index d4b19e9adf..5e952af85f 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -8,6 +8,21 @@ CREATE TABLE text ( contents TEXT NOT NULL ); +CREATE TABLE user ( + user_slug TEXT PRIMARY KEY, + user_name TEXT NOT NULL, + user_id INTEGER NOT NULL UNIQUE, + user_since INTEGER NOT NULL, + account_type TEXT NOT NULL, + karma INTEGER NOT NULL, + fetched_at INTEGER NOT NULL, + real_name TEXT, + gender TEXT, + birthday INTEGER, + location TEXT, + website TEXT +); + CREATE TABLE site ( site_slug TEXT PRIMARY KEY, site_descr TEXT NOT NULL, -- Wikicomma name @@ -62,17 +77,47 @@ CREATE TABLE file ( s3_hash TEXT NOT NULL ); -CREATE TABLE user ( - user_slug TEXT PRIMARY KEY, - user_name TEXT NOT NULL, - user_id INTEGER NOT NULL UNIQUE, - user_since INTEGER NOT NULL, - account_type TEXT NOT NULL, - karma INTEGER NOT NULL, - fetched_at INTEGER NOT NULL, - real_name TEXT, - gender TEXT, - birthday INTEGER, - location TEXT, - website TEXT +CREATE TABLE forum_category ( + forum_category_id INTEGER PRIMARY KEY, + site_slug TEXT NOT NULL REFERENCES site(site_slug), + title TEXT NOT NULL, + description TEXT NOT NULL, + last_user_id INTEGER NOT NULL REFERENCES user(user_id), + thread_count INTEGER NOT NULL, + post_count INTEGER NOT NULL, + full_scan INTEGER NOT NULL (full_scan IN (0, 1)), -- boolean + last_page INTEGER NOT NULL, + version INTEGER NOT NULL +); + +CREATE TABLE forum_thread ( + forum_thread_id INTEGER PRIMARY KEY, + forum_category_id INTEGER NOT NULL REFERENCES forum_category(forum_category_id), + site_slug TEXT NOT NULL REFERENCES site(site_slug), + title TEXT NOT NULL, + description TEXT NOT NULL, + last_user_id INTEGER NOT NULL REFERENCES user(user_id), + thread_count INTEGER NOT NULL, + post_count INTEGER NOT NULL, + full_scan INTEGER NOT NULL (full_scan IN (0, 1)), -- boolean + last_page INTEGER NOT NULL, + version INTEGER NOT NULL +); + +CREATE TABLE forum_post ( + forum_post_id INTEGER PRIMARY KEY, + forum_thread_id INTEGER NOT NULL REFERENCES forum_thread(forum_thread_id), + title TEXT NOT NULL, + created_at INTEGER NOT NULL, + created_by INTEGER NOT NULL REFERENCES user(user_id), + edited_at INTEGER NOT NULL, + edited_by INTEGER NOT NULL REFERENCES user(user_id), +); + +CREATE TABLE forum_post_revision ( + forum_post_revision_id INTEGER PRIMARY KEY, + forum_post_id INTEGER NOT NULL REFERENCES forum_post(forum_post_id), + title TEXT NOT NULL, + created_at INTEGER NOT NULL, + created_by INTEGER NOT NULL REFERENCES user(user_id) ); From 1642d13c699a6c03d0b2fb62f22506a99a275e2c Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 19:16:17 -0400 Subject: [PATCH 076/133] Allow multiple meta paths. 
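meta_path() now forwards any number of components to os.path.join, so nested metadata such as meta/forum/category can be addressed in one call. Behavior sketch (POSIX separators assumed, dump directory hypothetical):

    import os

    def meta_path(directory: str, *paths: str) -> str:
        return os.path.join(directory, "meta", *paths)

    assert meta_path("dump", "forum", "category") == "dump/meta/forum/category"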
--- deepwell/importer/site.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index aff089f3fc..b874629340 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -160,8 +160,8 @@ def forum_dir(self) -> str: def page_dir(self) -> str: return os.path.join(self.directory, "pages") - def meta_path(self, path: str) -> str: - return os.path.join(self.directory, "meta", path) + def meta_path(self, *paths: str) -> str: + return os.path.join(self.directory, "meta", *paths) def json(self, path: str) -> Union[list, dict]: with open(path) as file: From 8406d174781069b617533f1742a12bcceed557bf Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 20:53:27 -0400 Subject: [PATCH 077/133] Add method for forum categories. --- deepwell/importer/database.py | 45 +++++++++++++++++++++++++++++++++++ deepwell/importer/seed.sql | 1 + deepwell/importer/site.py | 43 ++++++++++++++++++++++----------- 3 files changed, 75 insertions(+), 14 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 5a606211c7..ae30b7acef 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -304,3 +304,48 @@ def add_file( s3_hash, ), ) + + def add_forum_category( + self, + cur, + site_slug: str, + metadata: dict, + ) -> None: + forum_category_id = metadata["id"] + logger.info("Inserting forum category ID %d", forum_category_id) + + cur.execute( + """ + INSERT INTO forum_category + ( + forum_category_id, + site_slug, + title, + description, + last_user_id, + last_posted_at, + thread_count, + post_count, + full_scan, + last_page, + version + ) + VALUES + (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT + DO NOTHING + """, + ( + forum_category_id, + site_slug, + metadata["title"], + metadata["description"], + metadata["lastUser"], + metadata["last"], + metadata["threads"], + metadata["posts"], + metadata["full_scan"], + metadata["last_page"], + metadata["version"], + ), + ) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 5e952af85f..ad8f0c5ead 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -83,6 +83,7 @@ CREATE TABLE forum_category ( title TEXT NOT NULL, description TEXT NOT NULL, last_user_id INTEGER NOT NULL REFERENCES user(user_id), + last_posted_at INTEGER NOT NULL, thread_count INTEGER NOT NULL, post_count INTEGER NOT NULL, full_scan INTEGER NOT NULL (full_scan IN (0, 1)), -- boolean diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index b874629340..3464a42c3e 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -185,19 +185,19 @@ def process_pages(self) -> None: def process_page_metadata(self) -> None: logger.info("Ingesting page revision metadata for site %s", self.site_slug) meta_directory = self.meta_path("pages") - for path in os.listdir(meta_directory): - logger.debug("Processing page metadata from '%s'", path) - - # NOTE: Usually page_slug is the same as page_descr, but if - # there are any colons in it, then they don't match. - # So we can use it as a temporary unique identifier - # but *not* as the slug. 
-            page_descr, ext = os.path.splitext(path)
-            assert ext == ".json", "Extension for page metadata not JSON"
-            path = os.path.join(meta_directory, path)
-
-            metadata = self.json(path)
-            with self.database.conn as cur:
+        with self.database.conn as cur:
+            for path in os.listdir(meta_directory):
+                logger.debug("Processing page metadata from '%s'", path)
+
+                # NOTE: Usually page_slug is the same as page_descr, but if
+                # there are any colons in it, then they don't match.
+                # So we can use it as a temporary unique identifier
+                # but *not* as the slug.
+                page_descr, ext = os.path.splitext(path)
+                assert ext == ".json", "Extension for page metadata not JSON"
+                path = os.path.join(meta_directory, path)
+
+                metadata = self.json(path)
                 self.database.add_page(
                     cur,
                     site_slug=self.site_slug,
@@ -308,5 +308,20 @@ def process_files(self) -> None:
 
     def process_forum(self) -> None:
         logger.info("Ingesting forum data for site %s", self.site_slug)
+        self.process_forum_categories()
         # TODO
-        ...
+
+    def process_forum_categories(self) -> None:
+        logger.debug("Processing forum categories")
+        directory = self.meta_path("forum", "category")
+        with self.database.conn as cur:
+            for path in os.listdir(directory):
+                logger.debug("Processing forum category metadata from '%s'", path)
+
+                forum_category_id_str, ext = os.path.splitext(path)
+                forum_category_id = int(forum_category_id_str)
+                assert ext == ".json", "Extension for forum category metadata not JSON"
+                path = os.path.join(directory, path)
+
+                metadata = self.json(path)
+                self.database.add_forum_category(cur, self.site_slug, metadata)

From 3600c5c48bea465e0bca6197feadbc20777763e8 Mon Sep 17 00:00:00 2001
From: Emmie Maeda
Date: Sat, 29 Jun 2024 21:17:17 -0400
Subject: [PATCH 078/133] Fix forum category metadata ingestion.
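The add_forum_category() INSERT listed eleven columns but only ten "?" placeholders, which SQLite rejects as soon as the statement is prepared. A minimal reproduction of that failure mode:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE t (a INTEGER, b INTEGER)")
    try:
        conn.execute("INSERT INTO t (a, b) VALUES (?)", (1,))
    except sqlite3.OperationalError as err:
        print(err)  # e.g. "1 values for 2 columns"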
ON CONFLICT DO NOTHING """, diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index ad8f0c5ead..0ca145f264 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -86,7 +86,7 @@ CREATE TABLE forum_category ( last_posted_at INTEGER NOT NULL, thread_count INTEGER NOT NULL, post_count INTEGER NOT NULL, - full_scan INTEGER NOT NULL (full_scan IN (0, 1)), -- boolean + full_scan INTEGER NOT NULL CHECK (full_scan IN (0, 1)), -- boolean last_page INTEGER NOT NULL, version INTEGER NOT NULL ); @@ -100,7 +100,7 @@ CREATE TABLE forum_thread ( last_user_id INTEGER NOT NULL REFERENCES user(user_id), thread_count INTEGER NOT NULL, post_count INTEGER NOT NULL, - full_scan INTEGER NOT NULL (full_scan IN (0, 1)), -- boolean + full_scan INTEGER NOT NULL CHECK (full_scan IN (0, 1)), -- boolean last_page INTEGER NOT NULL, version INTEGER NOT NULL ); @@ -112,7 +112,7 @@ CREATE TABLE forum_post ( created_at INTEGER NOT NULL, created_by INTEGER NOT NULL REFERENCES user(user_id), edited_at INTEGER NOT NULL, - edited_by INTEGER NOT NULL REFERENCES user(user_id), + edited_by INTEGER NOT NULL REFERENCES user(user_id) ); CREATE TABLE forum_post_revision ( diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 3464a42c3e..70a408cba2 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -308,10 +308,11 @@ def process_files(self) -> None: def process_forum(self) -> None: logger.info("Ingesting forum data for site %s", self.site_slug) + self.process_forum_category_metadata() self.process_forum_categories() # TODO - def process_forum_categories(self) -> None: + def process_forum_category_metadata(self) -> None: logger.debug("Processing forum categories") directory = self.meta_path("forum", "category") with self.database.conn as cur: @@ -320,8 +321,12 @@ def process_forum_categories(self) -> None: forum_category_id_str, ext = os.path.splitext(path) forum_category_id = int(forum_category_id_str) - assert ext = ".json", "Extension for forum category metadata not JSON" + assert ext == ".json", "Extension for forum category metadata not JSON" path = os.path.join(directory, path) metadata = self.json(path) self.database.add_forum_category(cur, self.site_slug, metadata) + + def process_forum_categories(self) -> None: + # TODO + ... From 6977a6dd3f0370a9e9730e2e2f365e2d17751521 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 21:21:41 -0400 Subject: [PATCH 079/133] Remove last_posted_at. --- deepwell/importer/database.py | 4 +--- deepwell/importer/seed.sql | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 3432b50a8f..bf1346a586 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -323,7 +323,6 @@ def add_forum_category( title, description, last_user_id, - last_posted_at, thread_count, post_count, full_scan, @@ -331,7 +330,7 @@ def add_forum_category( version ) VALUES - (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT DO NOTHING """, @@ -341,7 +340,6 @@ def add_forum_category( metadata["title"], metadata["description"], metadata["lastUser"], - metadata["last"], metadata["threads"], metadata["posts"], metadata["full_scan"], diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 0ca145f264..b68299a0dc 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -83,7 +83,6 @@ CREATE TABLE forum_category ( title TEXT NOT NULL, description TEXT NOT NULL, last_user_id INTEGER NOT NULL REFERENCES user(user_id), - last_posted_at INTEGER NOT NULL, thread_count INTEGER NOT NULL, post_count INTEGER NOT NULL, full_scan INTEGER NOT NULL CHECK (full_scan IN (0, 1)), -- boolean From 6402983b33f2d4fef2478b4a831a5c76540a7608 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 21:25:47 -0400 Subject: [PATCH 080/133] Fix missing data. --- deepwell/importer/database.py | 6 +++--- deepwell/importer/seed.sql | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index bf1346a586..01f5450fde 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -339,9 +339,9 @@ def add_forum_category( site_slug, metadata["title"], metadata["description"], - metadata["lastUser"], - metadata["threads"], - metadata["posts"], + metadata.get("lastUser"), + metadata.get("threads"), + metadata.get("posts"), metadata["full_scan"], metadata["last_page"], metadata["version"], diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index b68299a0dc..0ea22f9186 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -82,9 +82,9 @@ CREATE TABLE forum_category ( site_slug TEXT NOT NULL REFERENCES site(site_slug), title TEXT NOT NULL, description TEXT NOT NULL, - last_user_id INTEGER NOT NULL REFERENCES user(user_id), - thread_count INTEGER NOT NULL, - post_count INTEGER NOT NULL, + last_user_id INTEGER REFERENCES user(user_id), + thread_count INTEGER, + post_count INTEGER, full_scan INTEGER NOT NULL CHECK (full_scan IN (0, 1)), -- boolean last_page INTEGER NOT NULL, version INTEGER NOT NULL @@ -96,9 +96,9 @@ CREATE TABLE forum_thread ( site_slug TEXT NOT NULL REFERENCES site(site_slug), title TEXT NOT NULL, description TEXT NOT NULL, - last_user_id INTEGER NOT NULL REFERENCES user(user_id), - thread_count INTEGER NOT NULL, - post_count INTEGER NOT NULL, + last_user_id INTEGER REFERENCES user(user_id), + thread_count INTEGER, + post_count INTEGER, full_scan INTEGER NOT NULL CHECK (full_scan IN (0, 1)), -- boolean last_page INTEGER NOT NULL, version INTEGER NOT NULL From 963cb6cfe4a34e6c9fb378bb9becedac586fc5ff Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 21:27:17 -0400 Subject: [PATCH 081/133] Handle missing directory. 
--- deepwell/importer/site.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 70a408cba2..5aacc1a39c 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -315,6 +315,11 @@ def process_forum(self) -> None: def process_forum_category_metadata(self) -> None: logger.debug("Processing forum categories") directory = self.meta_path("forum", "category") + + if not os.path.isdir(directory): + logger.warning("No forum category metadata directory") + return + with self.database.conn as cur: for path in os.listdir(directory): logger.debug("Processing forum category metadata from '%s'", path) From 82876c5127a8df6dbbe836552be57187b91e5f43 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 23:39:18 -0400 Subject: [PATCH 082/133] Start implementing forum ingestion. --- deepwell/importer/database.py | 76 +++++++++++++++++++++++++++++++++++ deepwell/importer/seed.sql | 18 ++++----- deepwell/importer/site.py | 31 ++++++++++++-- 3 files changed, 112 insertions(+), 13 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 01f5450fde..b82560cb64 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -2,6 +2,7 @@ import logging import os import sqlite3 +from typing import Optional from .wikicomma_config import SiteData from .utils import kangaroo_twelve, from_js_timestamp @@ -347,3 +348,78 @@ def add_forum_category( metadata["version"], ), ) + + def add_forum_thread(self, cur, forum_category_id: int, metadata: dict) -> None: + forum_thread_id = metadata["id"] + logger.info("Inserting forum thread ID %d", forum_thread_id) + + cur.execute( + """ + INSERT INTO forum_thread + ( + forum_thread_id, + forum_category_id, + title, + description, + created_at, + created_by, + post_count, + sticky, + locked, + version + ) + VALUES + (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT + DO NOTHING + """, + ( + forum_thread_id, + forum_category_id, + metadata["title"], + metadata["description"], + metadata["started"], + metadata["startedUser"], + metadata["postsNum"], + metadata["sticky"], + metadata["isLocked"], + metadata.get("version"), + ), + ) + + def add_forum_post( + self, + cur, + *, + forum_thread_id: int, + parent_post_id: Optional[int], + metadata: dict, + ) -> None: + forum_post_id = metadata["id"] + logger.info("Inserting forum post ID %d", forum_post_id) + + cur.execute( + """ + INSERT INTO forum_post + ( + forum_post_id, + forum_thread_id, + parent_post_id, + title, + created_at, + created_by + ) + VALUES + (?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT + DO NOTHING + """, + ( + forum_post_id, + forum_thread_id, + parent_post_id, + metadata["title"], + metadata["stamp"], + metadata["poster"], + ), + ) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 0ea22f9186..e97f5af568 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -93,25 +93,23 @@ CREATE TABLE forum_category ( CREATE TABLE forum_thread ( forum_thread_id INTEGER PRIMARY KEY, forum_category_id INTEGER NOT NULL REFERENCES forum_category(forum_category_id), - site_slug TEXT NOT NULL REFERENCES site(site_slug), title TEXT NOT NULL, description TEXT NOT NULL, - last_user_id INTEGER REFERENCES user(user_id), - thread_count INTEGER, - post_count INTEGER, - full_scan INTEGER NOT NULL CHECK (full_scan IN (0, 1)), -- boolean - last_page INTEGER NOT NULL, - version INTEGER NOT NULL + created_at INTEGER NOT NULL, + created_by INTEGER NOT NULL REFERENCES users(user_id), + post_count INTEGER NOT NULL, + sticky INTEGER NOT NULL CHECK (sticky IN (0, 1)), -- boolean + locked INTEGER NOT NULL CHECK (locked IN (0, 1)), -- boolean + version INTEGER ); CREATE TABLE forum_post ( forum_post_id INTEGER PRIMARY KEY, forum_thread_id INTEGER NOT NULL REFERENCES forum_thread(forum_thread_id), + parent_post_id INTEGER REFERENCES forum_post(forum_post_id), title TEXT NOT NULL, created_at INTEGER NOT NULL, - created_by INTEGER NOT NULL REFERENCES user(user_id), - edited_at INTEGER NOT NULL, - edited_by INTEGER NOT NULL REFERENCES user(user_id) + created_by INTEGER NOT NULL REFERENCES user(user_id) ); CREATE TABLE forum_post_revision ( diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 5aacc1a39c..6f56e6ac41 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -313,7 +313,7 @@ def process_forum(self) -> None: # TODO def process_forum_category_metadata(self) -> None: - logger.debug("Processing forum categories") + logger.debug("Processing forum categories (metadata)") directory = self.meta_path("forum", "category") if not os.path.isdir(directory): @@ -333,5 +333,30 @@ def process_forum_category_metadata(self) -> None: self.database.add_forum_category(cur, self.site_slug, metadata) def process_forum_categories(self) -> None: - # TODO - ... + logger.debug("Processing forum categories") + directory = self.meta_path("forum") + + if not os.path.isdir(directory): + logger.warning("No forum category parent directory") + return + + for path in os.listdir(directory): + logger.debug("Processing forum category directory '%s'", path) + + if path == "category": + # Special metadata directory, see above + continue + + forum_category_id = int(path) + directory = os.path.join(directory, path) + + with self.database.conn as cur: + for path in os.listdir(directory): + logger.debug("Processing forum thread directory '%s'", directory) + + path = os.path.join(directory, path) + metadata = self.json(path) + + self.database.add_forum_thread(cur, forum_category_id, metadata) + + # TODO handle posts, parents, revisions From 94567d50eec75836fd40645ca50742bf76d13290 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 29 Jun 2024 23:52:42 -0400 Subject: [PATCH 083/133] Fix invalid path formation bug. 
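The loop in patch 082 rebound the name `directory` on its first iteration, so every later category joined its thread paths against the previous category's directory. Giving the inner path its own name keeps the loop source stable:

    import os

    directory = "meta/forum"
    for name in ("5", "12"):  # hypothetical category IDs
        # Distinct name: `directory` still points at the parent next iteration.
        thread_directory = os.path.join(directory, name)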
--- deepwell/importer/site.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 6f56e6ac41..9081c08c74 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -348,13 +348,13 @@ def process_forum_categories(self) -> None: continue forum_category_id = int(path) - directory = os.path.join(directory, path) + thread_directory = os.path.join(directory, path) with self.database.conn as cur: - for path in os.listdir(directory): - logger.debug("Processing forum thread directory '%s'", directory) + for path in os.listdir(thread_directory): + logger.debug("Processing forum thread directory '%s'", thread_directory) - path = os.path.join(directory, path) + path = os.path.join(thread_directory, path) metadata = self.json(path) self.database.add_forum_thread(cur, forum_category_id, metadata) From a5084dcd4fa602de1333e10d2eb951e2082b5a36 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sun, 30 Jun 2024 00:39:42 -0400 Subject: [PATCH 084/133] Rename forum ingestion methods. --- deepwell/importer/site.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 9081c08c74..e09f6c45aa 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -308,11 +308,10 @@ def process_files(self) -> None: def process_forum(self) -> None: logger.info("Ingesting forum data for site %s", self.site_slug) - self.process_forum_category_metadata() self.process_forum_categories() - # TODO + self.process_forum_data() - def process_forum_category_metadata(self) -> None: + def process_forum_categories(self) -> None: logger.debug("Processing forum categories (metadata)") directory = self.meta_path("forum", "category") @@ -332,7 +331,7 @@ def process_forum_category_metadata(self) -> None: metadata = self.json(path) self.database.add_forum_category(cur, self.site_slug, metadata) - def process_forum_categories(self) -> None: + def process_forum_data(self) -> None: logger.debug("Processing forum categories") directory = self.meta_path("forum") From 74c2c72c07615503ae306cd8ef89ef009bee6a49 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sun, 30 Jun 2024 01:35:00 -0400 Subject: [PATCH 085/133] Update schema SQL. --- deepwell/importer/seed.sql | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index e97f5af568..2fd28b2a4e 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -109,7 +109,9 @@ CREATE TABLE forum_post ( parent_post_id INTEGER REFERENCES forum_post(forum_post_id), title TEXT NOT NULL, created_at INTEGER NOT NULL, - created_by INTEGER NOT NULL REFERENCES user(user_id) + created_by INTEGER NOT NULL REFERENCES user(user_id), + edited_at INTEGER NOT NULL, + edited_by INTEGER NOT NULL REFERENCES user(user_id) ); CREATE TABLE forum_post_revision ( @@ -119,3 +121,8 @@ CREATE TABLE forum_post_revision ( created_at INTEGER NOT NULL, created_by INTEGER NOT NULL REFERENCES user(user_id) ); + +CREATE TABLE forum_post_revision_wikitext ( + forum_post_revision_id INTEGER PRIMARY KEY REFERENCES forum_post_revision(forum_post_revision_id), + wikitext_hash TEXT NOT NULL REFERENCES text(hex_hash) +); From 14f8895d4074150e37a53c7d5241001b1d62f9ed Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sun, 30 Jun 2024 01:35:25 -0400 Subject: [PATCH 086/133] Start process_post() method. 
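Forum posts nest arbitrarily deep, so ingestion recurses: insert the post, then visit its children with the current post's ID as their parent. A toy walk over metadata of the same shape (IDs made up; in this revision the children hang off each revision entry, which a later patch moves onto the post itself):

    def walk(post: dict, parent_post_id=None) -> None:
        print("post", post["id"], "parent", parent_post_id)
        for child in post.get("children", []):
            walk(child, parent_post_id=post["id"])

    walk({"id": 7, "children": [{"id": 8, "children": []}]})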
--- deepwell/importer/site.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index e09f6c45aa..8294d5e5ea 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -4,7 +4,7 @@ import re from functools import cache from io import BytesIO -from typing import Tuple, Union +from typing import Optional, Tuple, Union from urllib.parse import unquote as percent_unquote from urllib.request import urlopen @@ -349,13 +349,24 @@ def process_forum_data(self) -> None: forum_category_id = int(path) thread_directory = os.path.join(directory, path) - with self.database.conn as cur: - for path in os.listdir(thread_directory): + for path in os.listdir(thread_directory): + with self.database.conn as cur: logger.debug("Processing forum thread directory '%s'", thread_directory) path = os.path.join(thread_directory, path) - metadata = self.json(path) - - self.database.add_forum_thread(cur, forum_category_id, metadata) - - # TODO handle posts, parents, revisions + thread_metadata = self.json(path) + + self.database.add_forum_thread(cur, forum_category_id, thread_metadata) + + for post in thread_metadata["posts"]: + self.process_post(cur, thread_id=thread_metadata["id"], parent_post_id=None, metadata=post) + + def process_post(self, cur, *, thread_id: int, parent_post_id: Optional[int], metadata: dict) -> None: + ... + post_id = metadata["id"] + self.database.add_forum_post(cur, ...) + # TODO handle posts, parents, revisions + for revision in metadata["revisions"]: + ... + for child_post in revision["children"]: + self.process_post(cur, thread_id=thread_id, parent_post_id=post_id, metadata=child_post) From cf2e4164048f2104983a4024ef68907cfb0cddac Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 14:36:49 -0400 Subject: [PATCH 087/133] Add edited fields to forum_post. --- deepwell/importer/database.py | 8 ++++++-- deepwell/importer/seed.sql | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index b82560cb64..875ad8fd23 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -407,10 +407,12 @@ def add_forum_post( parent_post_id, title, created_at, - created_by + created_by, + edited_at, + edited_by ) VALUES - (?, ?, ?, ?, ?, ?, ?) + (?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT DO NOTHING """, @@ -421,5 +423,7 @@ def add_forum_post( metadata["title"], metadata["stamp"], metadata["poster"], + metadata.get("lastEdit"), + metadata.get("lastEditBy"), ), ) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 2fd28b2a4e..9f6b707b3d 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -110,8 +110,8 @@ CREATE TABLE forum_post ( title TEXT NOT NULL, created_at INTEGER NOT NULL, created_by INTEGER NOT NULL REFERENCES user(user_id), - edited_at INTEGER NOT NULL, - edited_by INTEGER NOT NULL REFERENCES user(user_id) + edited_at INTEGER, + edited_by INTEGER REFERENCES user(user_id) ); CREATE TABLE forum_post_revision ( From 17de203a99a6e11aad9d1421be7dd3b0d60a22b9 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 16:34:19 -0400 Subject: [PATCH 088/133] Remove extra newline. --- deepwell/src/macros.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/deepwell/src/macros.rs b/deepwell/src/macros.rs index 52493cf287..48dd7ed965 100644 --- a/deepwell/src/macros.rs +++ b/deepwell/src/macros.rs @@ -37,7 +37,6 @@ macro_rules! 
str_write { /// This is done because the only failure mode for writing to a `String` /// would be insufficient memory, which would cause an abort anyways. /// -/// /// # See also /// * [`str_write!`](macro.str_write.html) macro_rules! str_writeln { From 28517e4afae20d565ca08fdcbdb682996bffc1f7 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 16:34:07 -0400 Subject: [PATCH 089/133] Implement recursive forum post data ingestion. --- deepwell/importer/database.py | 26 +++++++++++++++++ deepwell/importer/site.py | 54 ++++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 875ad8fd23..c2bea764a0 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -427,3 +427,29 @@ def add_forum_post( metadata.get("lastEditBy"), ), ) + + def add_forum_post_revision(self, cur, post_id: int, metadata: dict) -> None: + revision_id = metadata["id"] + logger.info("Inserting forum post ID %d (revision ID %d)", post_id, revision_id) + + cur.execute( + """ + INSERT INTO forum_post_revision + ( + forum_post_revision_id, + forum_post_id, + title, + created_at, + created_by + ) + VALUES + (?, ?, ?, ?, ?) + """, + ( + revision_id, + post_id, + metadata["title"], + metadata["stamp"], + metadata["author"], + ), + ) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 8294d5e5ea..ceb183fee1 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -175,7 +175,7 @@ def run(self) -> None: id=self.site_id, ) self.process_pages() - self.process_files() + # self.process_files() XXX self.process_forum() def process_pages(self) -> None: @@ -351,22 +351,54 @@ def process_forum_data(self) -> None: for path in os.listdir(thread_directory): with self.database.conn as cur: - logger.debug("Processing forum thread directory '%s'", thread_directory) + logger.debug( + "Processing forum thread directory '%s'", + thread_directory, + ) path = os.path.join(thread_directory, path) thread_metadata = self.json(path) - self.database.add_forum_thread(cur, forum_category_id, thread_metadata) + self.database.add_forum_thread( + cur, + forum_category_id, + thread_metadata, + ) for post in thread_metadata["posts"]: - self.process_post(cur, thread_id=thread_metadata["id"], parent_post_id=None, metadata=post) + self.process_post( + cur, + thread_id=thread_metadata["id"], + metadata=post, + ) - def process_post(self, cur, *, thread_id: int, parent_post_id: Optional[int], metadata: dict) -> None: - ... + def process_post( + self, + cur, + *, + thread_id: int, + parent_post_id: Optional[int] = None, + metadata: dict, + ) -> None: + logger.info("Processing forum post in %d (parent %s)", thread_id, parent_post_id) post_id = metadata["id"] - self.database.add_forum_post(cur, ...) - # TODO handle posts, parents, revisions + self.database.add_forum_post( + cur, + forum_thread_id=thread_id, + parent_post_id=parent_post_id, + metadata=metadata, + ) + + logger.debug("Found %d children in forum post", len(metadata["children"])) + for child_post in metadata["children"]: + self.process_post( + cur, + thread_id=thread_id, + parent_post_id=post_id, + metadata=child_post, + ) + + logger.debug("Found %d revisions for forum post", len(metadata["revisions"])) + metadata["revisions"].sort(key=lambda d: d["id"]) for revision in metadata["revisions"]: - ... 
- for child_post in revision["children"]: - self.process_post(cur, thread_id=thread_id, parent_post_id=post_id, metadata=child_post) + self.database.add_forum_post_revision(cur, post_id, revision) From 1b72b17c89c0478071b6073ce7c5a2b348d370fc Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 16:37:01 -0400 Subject: [PATCH 090/133] Only process revision section if there's data. Slight speed-up by avoiding sorting and iteration if there's nothing. --- deepwell/importer/site.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index ceb183fee1..22eab95084 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -399,6 +399,8 @@ def process_post( ) logger.debug("Found %d revisions for forum post", len(metadata["revisions"])) - metadata["revisions"].sort(key=lambda d: d["id"]) - for revision in metadata["revisions"]: - self.database.add_forum_post_revision(cur, post_id, revision) + if metadata["revisions"]: + metadata["revisions"].sort(key=lambda d: d["id"]) + + for revision in metadata["revisions"]: + self.database.add_forum_post_revision(cur, post_id, revision) From d3c6dce20f0d7bfb09b2a70173baf009864e34a7 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 16:38:36 -0400 Subject: [PATCH 091/133] Remove debug comment line. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 22eab95084..aaeca5b8f9 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -175,7 +175,7 @@ def run(self) -> None: id=self.site_id, ) self.process_pages() - # self.process_files() XXX + self.process_files() self.process_forum() def process_pages(self) -> None: From f85d2822d463dbe5986bf92ea2f73fc8154c52c2 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 16:43:01 -0400 Subject: [PATCH 092/133] Insert blob records into SQLite database. Add a foreign key to ensure consistency. --- deepwell/importer/database.py | 12 ++++++++++++ deepwell/importer/importer.py | 2 +- deepwell/importer/s3.py | 7 ++++++- deepwell/importer/seed.sql | 2 +- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index c2bea764a0..504676c68d 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -266,6 +266,18 @@ def add_page_vote( ), ) + def add_blob(self, cur, hex_hash: str, length: int) -> None: + logger.info("Inserting blob record %s", hex_hash) + + cur.execute( + """ + INSERT INTO blob + (hex_hash, length) + VALUES (?, ?)
+ """, + (hex_hash, length), + ) + def add_file( self, cur, diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index dd75ba6baa..f099070cbb 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -32,7 +32,7 @@ def __init__( self.wikicomma_config = wikicomma_config self.wikicomma_directory = wikicomma_directory self.database = Database(sqlite_path, delete=delete_sqlite) - self.s3 = S3(aws_profile=aws_profile, bucket=s3_bucket) + self.s3 = S3(aws_profile=aws_profile, bucket=s3_bucket, database=self.database) def run(self) -> None: logger.info("Starting Wikicomma importer...") diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index e07f11017c..5fb2567968 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -12,13 +12,15 @@ class S3: "session", "client", "bucket", + "database", ) - def __init__(self, *, aws_profile, bucket) -> None: + def __init__(self, *, aws_profile, bucket, database) -> None: self.aws_profile = aws_profile self.session = boto3.Session(profile_name=aws_profile) self.client = self.session.client("s3") self.bucket = bucket + self.database = database def exists(self, s3_path: str) -> bool: try: @@ -49,4 +51,7 @@ def upload(self, file_path: str) -> str: ContentLength=len(data), ) + with self.database.conn as cur: + self.database.add_blob(cur, s3_path, len(data)) + return s3_path diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 9f6b707b3d..780a6c4821 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -74,7 +74,7 @@ CREATE TABLE file ( page_id INTEGER NOT NULL REFERENCES page(page_id), site_slug TEXT NOT NULL REFERENCES site(site_slug), filename TEXT NOT NULL, - s3_hash TEXT NOT NULL + s3_hash TEXT NOT NULL REFERENCES blob(hex_hash) ); CREATE TABLE forum_category ( From 4df6c2a04dda73052c596a7a0df60c025da283dc Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 16:48:18 -0400 Subject: [PATCH 093/133] Store MIME type in SQLite too. For the mime_hint column in Wikijump. --- deepwell/importer/database.py | 13 ++++++++----- deepwell/importer/requirements.txt | 3 ++- deepwell/importer/s3.py | 2 +- deepwell/importer/seed.sql | 1 + 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 504676c68d..2ede77fed8 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -7,6 +7,8 @@ from .wikicomma_config import SiteData from .utils import kangaroo_twelve, from_js_timestamp +import magic + logger = logging.getLogger(__name__) @@ -266,16 +268,17 @@ def add_page_vote( ), ) - def add_blob(self, cur, hex_hash: str, length: int) -> None: - logger.info("Inserting blob record %s", hex_hash) + def add_blob(self, cur, data: bytes, hex_hash: str) -> None: + mime = magic.from_buffer(data, mime=True) + logger.info("Inserting blob record, MIME type '%s'", mime) cur.execute( """ INSERT INTO blob - (hex_hash, length) - VALUES (?, ?) + (hex_hash, mime, length) + VALUES (?, ?, ?) 
""", - (hex_hash, length), + (hex_hash, mime, len(data)), ) def add_file( diff --git a/deepwell/importer/requirements.txt b/deepwell/importer/requirements.txt index 5c14644825..e526254bd6 100644 --- a/deepwell/importer/requirements.txt +++ b/deepwell/importer/requirements.txt @@ -1,3 +1,4 @@ boto3>=1.34.0 -pycryptodome>=3.20.0 py7zr>=0.21.0 +pycryptodome>=3.20.0 +python-magic>=0.4.0 diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 5fb2567968..1c9071a3e5 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -52,6 +52,6 @@ def upload(self, file_path: str) -> str: ) with self.database.conn as cur: - self.database.add_blob(cur, s3_path, len(data)) + self.database.add_blob(cur, data, s3_path) return s3_path diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 780a6c4821..f4efbfc950 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -1,5 +1,6 @@ CREATE TABLE blob ( hex_hash TEXT PRIMARY KEY, + mime TEXT NOT NULL, length INTEGER NOT NULL ); From 210b74177c0b10f0df7e75318cc2842fd7c49f70 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 18:30:33 -0400 Subject: [PATCH 094/133] Initial addition of methods for forum wikitexts. --- deepwell/importer/database.py | 40 +++++++++++++++++++++++++ deepwell/importer/seed.sql | 5 ++++ deepwell/importer/site.py | 56 ++++++++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 2ede77fed8..f8f8379bbf 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -468,3 +468,43 @@ def add_forum_post_revision(self, cur, post_id: int, metadata: dict) -> None: metadata["author"], ), ) + + def add_forum_post_wikitext(self, cur, forum_post_id: int, contents: str): + logger.info("Inserting forum post wikitext for ID %d", forum_post_id) + hex_hash = self.add_text(cur, contents) + + cur.execute( + """ + INSERT INTO forum_post_wikitext + ( + forum_post_id, + wikitext_hash + ) + VALUES + (?, ?) + ON CONFLICT + DO UPDATE + SET wikitext_hash = ? + """, + (forum_post_id, hex_hash, hex_hash), + ) + + def add_forum_post_revision_wikitext(self, cur, forum_post_revision_id: int, contents: str): + logger.info("Inserting forum post revision wikitext for ID %d", forum_post_revision_id) + hex_hash = self.add_text(cur, contents) + + cur.execute( + """ + INSERT INTO forum_post_revision_wikitext + ( + forum_post_revision_id, + wikitext_hash + ) + VALUES + (?, ?) + ON CONFLICT + DO UPDATE + SET wikitext_hash = ? 
+ """, + (forum_post_revision_id, hex_hash, hex_hash), + ) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index f4efbfc950..8eb73f5a59 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -115,6 +115,11 @@ CREATE TABLE forum_post ( edited_by INTEGER REFERENCES user(user_id) ); +CREATE TABLE forum_post_wikitext ( + forum_post_id INTEGER PRIMARY KEY REFERENCES forum_post(forum_post_id), + wikitext_hash TEXT NOT NULL REFERENCES text(hex_hash) +); + CREATE TABLE forum_post_revision ( forum_post_revision_id INTEGER PRIMARY KEY, forum_post_id INTEGER NOT NULL REFERENCES forum_post(forum_post_id), diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index aaeca5b8f9..0e10b16d0b 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -274,7 +274,7 @@ def process_page_revisions_wikitext( # Get revision ID revision_id = self.get_revision_id(cur, page_id, revision_number) - # Converting from binary, mostly to ensure it's UTF-8 + # Convert from binary, mostly to ensure it's UTF-8 contents = buf.read().decode("utf-8") # Run ingestion for this revision @@ -310,6 +310,7 @@ def process_forum(self) -> None: logger.info("Ingesting forum data for site %s", self.site_slug) self.process_forum_categories() self.process_forum_data() + self.process_forum_wikitext() def process_forum_categories(self) -> None: logger.debug("Processing forum categories (metadata)") @@ -404,3 +405,56 @@ def process_post( for revision in metadata["revisions"]: self.database.add_forum_post_revision(cur, post_id, revision) + + def process_forum_wikitext(self) -> None: + logger.info("Ingesting forum wikitext for site %s", self.site_slug) + + # Each forum category + for category_id_str in os.listdir(self.forum_dir): + logger.debug("Processing forum wikitext for category ID %s", category_id_str) + category_id = int(category_id_str) + directory = os.path.join(self.forum_dir, category_id_str) + + # Each forum thread + for path in os.listdir(directory): + thread_id_str, ext = os.path.splitext(path) + assert ext == ".7z", "Extension for forum wikitexts not 7z" + path = os.path.join(directory, path) + + thread_id = int(thread_id_str) + logger.debug("Processing forum wikitext for thread ID %s", thread_id_str) + + # Extract page sources for each post and revision + with py7zr.SevenZipFile(path, "r") as archive: + sources = archive.readall() + + # Convert and begin adding to the database + self.process_forum_revisions_wikitext(thread_id, sources) + + def process_forum_revisions_wikitext( + self, + thread_id: int, + sources: dict[str, BytesIO], + ) -> None: + logger.debug("Ingesting %d forum thread revision wikitexts", len(sources)) + + with self.database.conn as cur: + for path, buf in sources.items(): + post_id_str, filename = os.path.split(path) + revision, ext = os.path.splitext(filename) + assert ext == ".html", "Extension for forum revision HTML not html" + post_id = int(post_id_str) + + # Convert from binary, mostly to ensure it's UTF-8 + contents = buf.read().decode("utf-8") + + # This is kind of a mess because we don't have + # forum post revision IDs for the latest revision. 
:( + + # Per-post wikitext + if revision == "latest": + self.database.add_forum_post_revision(cur, post_id, contents) + # Per-revision wikitext + else: + revision_id = int(revision) + self.database.add_forum_post_revision_wikitext(cur, revision_id, contents) From 18eecb8135e0447cd3caf4fc3267f1b5c756cefd Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 18:31:58 -0400 Subject: [PATCH 095/133] Remove extra whitespace from HTML. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 0e10b16d0b..dc4c959661 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -446,7 +446,7 @@ def process_forum_revisions_wikitext( post_id = int(post_id_str) # Convert from binary, mostly to ensure it's UTF-8 - contents = buf.read().decode("utf-8") + contents = buf.read().decode("utf-8").strip() # This is kind of a mess because we don't have # forum post revision IDs for the latest revision. :( From 8867a2598d63d4549b68fd696106eebaac6dab1b Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 18:36:23 -0400 Subject: [PATCH 096/133] Fix issues. --- deepwell/importer/database.py | 2 +- deepwell/importer/site.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index f8f8379bbf..cbad67c29e 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -470,7 +470,7 @@ def add_forum_post_revision(self, cur, post_id: int, metadata: dict) -> None: ) def add_forum_post_wikitext(self, cur, forum_post_id: int, contents: str): - logger.info("Inserting forum post wikitext for ID %d", forum_post_id) + logger.info("Inserting latest forum post wikitext for ID %d", forum_post_id) hex_hash = self.add_text(cur, contents) cur.execute( diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index dc4c959661..62881ede70 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -453,7 +453,7 @@ def process_forum_revisions_wikitext( # Per-post wikitext if revision == "latest": - self.database.add_forum_post_revision(cur, post_id, contents) + self.database.add_forum_post_wikitext(cur, post_id, contents) # Per-revision wikitext else: revision_id = int(revision) From 763dd8433da314305e129e47b8919957e4072a92 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 18:44:46 -0400 Subject: [PATCH 097/133] Skip missing forum directory. This happens if there are no posts. --- deepwell/importer/site.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 62881ede70..45e22faa27 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -409,6 +409,10 @@ def process_post( def process_forum_wikitext(self) -> None: logger.info("Ingesting forum wikitext for site %s", self.site_slug) + if not os.path.isdir(self.forum_dir): + logger.warning("No forum directory for site") + return + # Each forum category for category_id_str in os.listdir(self.forum_dir): logger.debug("Processing forum wikitext for category ID %s", category_id_str) From 8b70757ca9be55df23fa36d3ef5b7f8735b68413 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 18:51:55 -0400 Subject: [PATCH 098/133] Add debug line for _users 'site'. 
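A note on the forum wikitext flow settled by patches 094-096 above: each extracted 7z entry is named "<post_id>/<revision>.html", and after the fix in patch 096 the "latest" entry feeds the per-post table while numeric entries feed the per-revision table. A condensed sketch of that routing (helper name hypothetical; the real logic is inlined in process_forum_revisions_wikitext):

    import os

    def classify_forum_source(path: str) -> tuple[str, int]:
        # Archive entries look like "<post_id>/<revision>.html".
        post_id_str, filename = os.path.split(path)
        revision, ext = os.path.splitext(filename)
        assert ext == ".html", "Extension for forum revision HTML not html"
        if revision == "latest":
            return ("post", int(post_id_str))  # -> forum_post_wikitext
        return ("revision", int(revision))     # -> forum_post_revision_wikitext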
--- deepwell/importer/importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index f099070cbb..342f4d40b4 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -65,7 +65,7 @@ def process_sites(self) -> None: for site_descr in os.listdir(self.wikicomma_directory): if site_descr == "_users": - logger.debug("Skipping user list") + logger.debug("Skipping user list, not a site") continue elif site_descr.endswith(".torrent"): logger.debug("Skipping torrent file from Wikicomma sync") From 1ecef2b2191cf9de2027b987d4ee71971ef155d4 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 18:53:28 -0400 Subject: [PATCH 099/133] Run black formatter. --- deepwell/importer/database.py | 12 ++++++++++-- deepwell/importer/site.py | 22 ++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index cbad67c29e..a7717766f1 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -489,8 +489,16 @@ def add_forum_post_wikitext(self, cur, forum_post_id: int, contents: str): (forum_post_id, hex_hash, hex_hash), ) - def add_forum_post_revision_wikitext(self, cur, forum_post_revision_id: int, contents: str): - logger.info("Inserting forum post revision wikitext for ID %d", forum_post_revision_id) + def add_forum_post_revision_wikitext( + self, + cur, + forum_post_revision_id: int, + contents: str, + ): + logger.info( + "Inserting forum post revision wikitext for ID %d", + forum_post_revision_id, + ) hex_hash = self.add_text(cur, contents) cur.execute( diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 45e22faa27..0fb1c88748 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -381,7 +381,11 @@ def process_post( parent_post_id: Optional[int] = None, metadata: dict, ) -> None: - logger.info("Processing forum post in %d (parent %s)", thread_id, parent_post_id) + logger.info( + "Processing forum post in %d (parent %s)", + thread_id, + parent_post_id, + ) post_id = metadata["id"] self.database.add_forum_post( cur, @@ -415,7 +419,10 @@ def process_forum_wikitext(self) -> None: # Each forum category for category_id_str in os.listdir(self.forum_dir): - logger.debug("Processing forum wikitext for category ID %s", category_id_str) + logger.debug( + "Processing forum wikitext for category ID %s", + category_id_str, + ) category_id = int(category_id_str) directory = os.path.join(self.forum_dir, category_id_str) @@ -426,7 +433,10 @@ def process_forum_wikitext(self) -> None: path = os.path.join(directory, path) thread_id = int(thread_id_str) - logger.debug("Processing forum wikitext for thread ID %s", thread_id_str) + logger.debug( + "Processing forum wikitext for thread ID %s", + thread_id_str, + ) # Extract page sources for each post and revision with py7zr.SevenZipFile(path, "r") as archive: @@ -461,4 +471,8 @@ def process_forum_revisions_wikitext( # Per-revision wikitext else: revision_id = int(revision) - self.database.add_forum_post_revision_wikitext(cur, revision_id, contents) + self.database.add_forum_post_revision_wikitext( + cur, + revision_id, + contents, + ) From 8a081f6230dcf702c0d0ed82f3c8dac9ea4cd24a Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 18:54:55 -0400 Subject: [PATCH 100/133] Add explanatory note on table. 
--- deepwell/importer/seed.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 8eb73f5a59..d8b4b8f24b 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -115,6 +115,7 @@ CREATE TABLE forum_post ( edited_by INTEGER REFERENCES user(user_id) ); +-- For the latest post revision's wikitext CREATE TABLE forum_post_wikitext ( forum_post_id INTEGER PRIMARY KEY REFERENCES forum_post(forum_post_id), wikitext_hash TEXT NOT NULL REFERENCES text(hex_hash) From 9a20bf9e2896f5f57406963a31572ecec3b39ef6 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 19:03:03 -0400 Subject: [PATCH 101/133] Update message again. --- deepwell/importer/importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/importer.py b/deepwell/importer/importer.py index 342f4d40b4..771af5af10 100644 --- a/deepwell/importer/importer.py +++ b/deepwell/importer/importer.py @@ -65,7 +65,7 @@ def process_sites(self) -> None: for site_descr in os.listdir(self.wikicomma_directory): if site_descr == "_users": - logger.debug("Skipping user list, not a site") + logger.debug("Skipping '_users', not a site") continue elif site_descr.endswith(".torrent"): logger.debug("Skipping torrent file from Wikicomma sync") From 514b5b610cff92732d7ae6ec6731e0cd756d8ec0 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 19:22:56 -0400 Subject: [PATCH 102/133] Fix add_page_vote(). --- deepwell/importer/database.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index a7717766f1..6f8d4d1912 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -237,13 +237,13 @@ def add_page_vote( *, page_id: int, user_id: int, - vote_value: int, + value: int, ) -> None: logger.info( "Inserting page vote for page ID %d / user ID %d (value %d)", page_id, user_id, - vote_value, + value, ) cur.execute( @@ -263,8 +263,8 @@ def add_page_vote( ( page_id, user_id, - vote_value, - vote_value, + value, + value, ), ) From 0b05279fedb3997319e747f4c03d1d11778fb66b Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 20:19:14 -0400 Subject: [PATCH 103/133] Pass in file_metadata and store it. --- deepwell/importer/database.py | 10 +++------- deepwell/importer/requirements.txt | 1 - deepwell/importer/s3.py | 4 ++-- deepwell/importer/site.py | 16 ++++++++++++---- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 6f8d4d1912..9389b3126c 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -7,8 +7,6 @@ from .wikicomma_config import SiteData from .utils import kangaroo_twelve, from_js_timestamp -import magic - logger = logging.getLogger(__name__) @@ -268,17 +266,15 @@ def add_page_vote( ), ) - def add_blob(self, cur, data: bytes, hex_hash: str) -> None: - mime = magic.from_buffer(data, mime=True) - - logger.info("Inserting blob record, MIME type '%s'", mime) + def add_blob(self, cur, *, hex_hash: str, length: int, mime: str) -> None: + logger.debug("Inserting blob record") cur.execute( """ INSERT INTO blob (hex_hash, mime, length) VALUES (?, ?, ?) 
""", - (hex_hash, mime, len(data)), + (hex_hash, mime, length), ) def add_file( diff --git a/deepwell/importer/requirements.txt b/deepwell/importer/requirements.txt index e526254bd6..46eed78c90 100644 --- a/deepwell/importer/requirements.txt +++ b/deepwell/importer/requirements.txt @@ -1,4 +1,3 @@ boto3>=1.34.0 py7zr>=0.21.0 pycryptodome>=3.20.0 -python-magic>=0.4.0 diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 1c9071a3e5..f68212e6c5 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -32,7 +32,7 @@ def exists(self, s3_path: str) -> bool: except: return False - def upload(self, file_path: str) -> str: + def upload(self, file_path: str, mime: str) -> str: with open(file_path, "rb") as file: data = file.read() # files use SHA256, text uses K12 @@ -52,6 +52,6 @@ def upload(self, file_path: str) -> str: ) with self.database.conn as cur: - self.database.add_blob(cur, data, s3_path) + self.database.add_blob(cur, hex_hash=s3_path, length=len(data), mime=mime) return s3_path diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 0fb1c88748..421a3074dd 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -27,6 +27,7 @@ class SiteImporter: "site_slug", "site_url", "site_id", + "file_metadata", ) def __init__( @@ -46,6 +47,7 @@ def __init__( self.site_slug = site_slug self.site_url = site_url self.site_id = self.get_site_id(site_url) + self.file_metadata = {} @cache def get_site_id(self, site_url: str) -> int: @@ -204,7 +206,12 @@ def process_page_metadata(self) -> None: page_descr=page_descr, metadata=metadata, ) + page_id = metadata["page_id"] + for file_metadata in metadata.get("files", ()): + file_id = file_metadata["file_id"] + self.file_metadata[file_id] = file_metadata + self.process_page_revisions_metadata( cur, page_id, @@ -285,17 +292,18 @@ def process_files(self) -> None: mapping = self.json(self.meta_path("file_map.json")) with self.database.conn as cur: - for file_id, entry in mapping.items(): - file_id = int(file_id) + for file_id_str, entry in mapping.items(): + file_id = int(file_id_str) wikidot_url = entry["url"] + file_metadata = self.file_metadata[file_id] logger.debug("Processing file stored at %s", wikidot_url) page_slug_url, filename = os.path.split(entry["path"]) page_slug = percent_unquote(page_slug_url) page_id = self.get_page_id(page_slug=page_slug) - path = os.path.join(self.file_dir, page_slug_url, str(file_id)) - s3_hash = self.s3.upload(path) + path = os.path.join(self.file_dir, page_slug_url, file_id_str) + s3_hash = self.s3.upload(path, file_metadata["mime"]) self.database.add_file( cur, From 6564e9364254d9d1d5d3eaf19136009e985bf1f6 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 20:19:43 -0400 Subject: [PATCH 104/133] Change database commit order. Save the data more frequently. 
--- deepwell/importer/site.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 421a3074dd..3245d1300d 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -187,8 +187,8 @@ def process_pages(self) -> None: def process_page_metadata(self) -> None: logger.info("Ingesting page revision metadata for site %s", self.site_slug) meta_directory = self.meta_path("pages") - with self.database.conn as cur: - for path in os.listdir(meta_directory): + for path in os.listdir(meta_directory): + with self.database.conn as cur: logger.debug("Processing page metadata from '%s'", path) # NOTE: Usually page_slug is the same as page_descr, but if From a1f9eaae20034a9743469b4df6821ba2c18b1fa2 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 20:27:37 -0400 Subject: [PATCH 105/133] Add plus sign to vote values. --- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 9389b3126c..3399c57b88 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -238,7 +238,7 @@ def add_page_vote( value: int, ) -> None: logger.info( - "Inserting page vote for page ID %d / user ID %d (value %d)", + "Inserting page vote for page ID %d / user ID %d (value %+d)", page_id, user_id, value, From b537368b2b66216b6eeac4d5378503c6201e45d7 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:08:05 -0400 Subject: [PATCH 106/133] Add logic to delete/re-insert pages with multiples. --- deepwell/importer/database.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 3399c57b88..171cbd3526 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -144,6 +144,34 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N page_descr, ) + page_id = metadata["page_id"] + sitemap_updated_at = metadata["sitemap_update"] // 1000 + + # If a page has been moved, it can leave multiple entries. + # We want the most recent page if we find such entries. + result = cur.execute( + """ + SELECT sitemap_updated_at + FROM page + WHERE page_id = ? + AND site_slug = ? + """, + (page_id, site_slug), + ).fetchone() + if result is not None: + (last_sitemap_updated_at,) = result + if last_sitemap_updated_at > sitemap_updated_at: + logger.warning("Found updated version of page ID %d, deleting previous", page_id) + cur.execute( + """ + DELETE FROM page + WHERE page_id = ? + AND site_slug = ? + """, + (page_id, site_slug), + ) + + # Insert new page cur.execute( """ INSERT INTO page @@ -159,15 +187,13 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ON CONFLICT - DO NOTHING """, ( - metadata["page_id"], + page_id, page_descr, metadata["name"], site_slug, - metadata["sitemap_update"] // 1000, + sitemap_updated_at, metadata.get("title", ""), metadata["is_locked"], json.dumps(metadata.get("tags", [])), From d129f42b6c7f28610fe75c9ee5ca1288529f200a Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:11:22 -0400 Subject: [PATCH 107/133] Fix comparison. 
--- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 171cbd3526..ac4c567709 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -160,7 +160,7 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ).fetchone() if result is not None: (last_sitemap_updated_at,) = result - if last_sitemap_updated_at > sitemap_updated_at: + if last_sitemap_updated_at < sitemap_updated_at: logger.warning("Found updated version of page ID %d, deleting previous", page_id) cur.execute( """ From e4c3dc099b951097155065fa06e513680215cd6d Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:12:43 -0400 Subject: [PATCH 108/133] Handle other case with comparison. --- deepwell/importer/database.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index ac4c567709..e2e8545d82 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -170,6 +170,9 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N """, (page_id, site_slug), ) + else: + logger.warning("Found another version of page ID, looks newer, skipping", page_id) + return # Insert new page cur.execute( From 9bcea0b7277bede017d78e247fc56922c50978cd Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:25:52 -0400 Subject: [PATCH 109/133] Add rows to new page_deleted table. --- deepwell/importer/database.py | 19 ++++++++++++++++--- deepwell/importer/seed.sql | 8 ++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index e2e8545d82..cb86529aaa 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -151,7 +151,7 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N # We want the most recent page if we find such entries. result = cur.execute( """ - SELECT sitemap_updated_at + SELECT page_descr, sitemap_updated_at FROM page WHERE page_id = ? AND site_slug = ? @@ -159,9 +159,9 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N (page_id, site_slug), ).fetchone() if result is not None: - (last_sitemap_updated_at,) = result + (prior_page_descr, last_sitemap_updated_at) = result if last_sitemap_updated_at < sitemap_updated_at: - logger.warning("Found updated version of page ID %d, deleting previous", page_id) + logger.warning("Found updated version of page ID %d, deleting previous '%s'", page_id, prior_page_descr) cur.execute( """ DELETE FROM page @@ -170,6 +170,19 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N """, (page_id, site_slug), ) + cur.execute( + """ + INSERT INTO page_deleted + ( + page_descr, + site_slug, + page_id + ) + VALUES + (?, ?, ?) 
+ """, + (prior_page_descr, site_slug, page_id), + ) else: logger.warning("Found another version of page ID, looks newer, skipping", page_id) return diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index d8b4b8f24b..3f33157bd3 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -45,6 +45,14 @@ CREATE TABLE page ( UNIQUE (site_slug, page_slug) ); +CREATE TABLE page_deleted ( + page_descr TEXT, + site_slug TEXT, + page_id INTEGER NOT NULL + + PRIMARY KEY (site_slug, page_descr) +); + CREATE TABLE page_revision ( revision_id INTEGER PRIMARY KEY, revision_number INTEGER NOT NULL CHECK (revision_number >= 0), From 1c22dc9692c6b99f9d900b0c9f605cf172944598 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:45:56 -0400 Subject: [PATCH 110/133] Add method for is_deleted_page(). --- deepwell/importer/database.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index cb86529aaa..3d58cd964c 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -216,6 +216,20 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ), ) + def is_deleted_page(self, cur, *, page_descr: str, site_slug: str) -> bool: + logger.debug("Checking if page descr %s exists in site %s", page_descr, site_slug) + + result = cur.execute( + """ + SELECT * + FROM page_deleted + WHERE page_descr = ? + AND site_slug = ? + """, + (page_descr, site_slug), + ).fetchone() + return result is not None + def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: logger.info( "Inserting page revision %d for page ID %d", From 028fb165a091203ffdf9929b0080201400ead582 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:54:29 -0400 Subject: [PATCH 111/133] Add logic to read and skip deleted pages. --- deepwell/importer/database.py | 22 ++++++++++++---------- deepwell/importer/site.py | 4 ++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 3d58cd964c..23844a2910 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -216,18 +216,20 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ), ) - def is_deleted_page(self, cur, *, page_descr: str, site_slug: str) -> bool: + def is_deleted_page(self, *, page_descr: str, site_slug: str) -> bool: logger.debug("Checking if page descr %s exists in site %s", page_descr, site_slug) - result = cur.execute( - """ - SELECT * - FROM page_deleted - WHERE page_descr = ? - AND site_slug = ? - """, - (page_descr, site_slug), - ).fetchone() + with self.conn as cur: + result = cur.execute( + """ + SELECT * + FROM page_deleted + WHERE page_descr = ? + AND site_slug = ? 
+ """, + (page_descr, site_slug), + ).fetchone() + return result is not None def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 3245d1300d..f89f37662c 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -259,6 +259,10 @@ def process_page_wikitext(self) -> None: with py7zr.SevenZipFile(path, "r") as archive: sources = archive.readall() + if self.database.is_deleted_page(page_descr=page_descr, site_slug=self.site_slug): + logger.warning("Page descr '%s' was previously deleted, skipping", page_descr) + continue + page_id = self.get_page_id(page_descr=page_descr) # Convert and begin adding to the database self.process_page_revisions_wikitext(page_id, sources) From 1ae7a370f2c9d7563e953a344088d38b84b4357c Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:56:03 -0400 Subject: [PATCH 112/133] Fix seed syntax. --- deepwell/importer/seed.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 3f33157bd3..4901207d6c 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -48,7 +48,7 @@ CREATE TABLE page ( CREATE TABLE page_deleted ( page_descr TEXT, site_slug TEXT, - page_id INTEGER NOT NULL + page_id INTEGER NOT NULL, PRIMARY KEY (site_slug, page_descr) ); From 3ff01f6b8dfa64f5fd05e9fe95854679854cdcea Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 21:57:29 -0400 Subject: [PATCH 113/133] Return result of is_deleted_page(). --- deepwell/importer/database.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 23844a2910..fb7e3d9260 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -217,8 +217,6 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ) def is_deleted_page(self, *, page_descr: str, site_slug: str) -> bool: - logger.debug("Checking if page descr %s exists in site %s", page_descr, site_slug) - with self.conn as cur: result = cur.execute( """ @@ -230,7 +228,9 @@ def is_deleted_page(self, *, page_descr: str, site_slug: str) -> bool: (page_descr, site_slug), ).fetchone() - return result is not None + exists = result is not None + logger.debug("Checking if page descr %s exists in site %s: %s", page_descr, site_slug, exists) + return exists def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: logger.info( From 11335d44595e4e191af3fd924cbe9f15fd3d903b Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 22:05:55 -0400 Subject: [PATCH 114/133] Add separate method for adding deleted pages. 
--- deepwell/importer/database.py | 63 ++++++++++++++++++++++++++--------- deepwell/importer/s3.py | 7 +++- deepwell/importer/site.py | 10 ++++-- 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index fb7e3d9260..4fbc930e68 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -161,7 +161,11 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N if result is not None: (prior_page_descr, last_sitemap_updated_at) = result if last_sitemap_updated_at < sitemap_updated_at: - logger.warning("Found updated version of page ID %d, deleting previous '%s'", page_id, prior_page_descr) + logger.warning( + "Found updated version of page ID %d, deleting previous '%s'", + page_id, + prior_page_descr, + ) cur.execute( """ DELETE FROM page @@ -170,21 +174,17 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N """, (page_id, site_slug), ) - cur.execute( - """ - INSERT INTO page_deleted - ( - page_descr, - site_slug, - page_id - ) - VALUES - (?, ?, ?) - """, - (prior_page_descr, site_slug, page_id), + self.add_deleted_page( + cur, + page_descr=page_descr, + site_slug=site_slug, + page_id=page_id, ) else: - logger.warning("Found another version of page ID, looks newer, skipping", page_id) + logger.warning( + "Found another version of page ID, looks newer, skipping", + page_id, + ) return # Insert new page @@ -216,6 +216,34 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ), ) + def add_deleted_page( + self, + cur, + *, + page_descr: str, + site_slug: str, + page_id: int, + ) -> None: + logger.debug( + "Adding deleted page: %s / %s (%d)", + page_descr, + site_slug, + page_id, + ) + cur.execute( + """ + INSERT INTO page_deleted + ( + page_descr, + site_slug, + page_id + ) + VALUES + (?, ?, ?) 
+ """, + (prior_page_descr, site_slug, page_id), + ) + def is_deleted_page(self, *, page_descr: str, site_slug: str) -> bool: with self.conn as cur: result = cur.execute( @@ -229,7 +257,12 @@ def is_deleted_page(self, *, page_descr: str, site_slug: str) -> bool: ).fetchone() exists = result is not None - logger.debug("Checking if page descr %s exists in site %s: %s", page_descr, site_slug, exists) + logger.debug( + "Checking if page descr %s exists in site %s: %s", + page_descr, + site_slug, + exists, + ) return exists def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index f68212e6c5..ba841ae097 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -52,6 +52,11 @@ def upload(self, file_path: str, mime: str) -> str: ) with self.database.conn as cur: - self.database.add_blob(cur, hex_hash=s3_path, length=len(data), mime=mime) + self.database.add_blob( + cur, + hex_hash=s3_path, + length=len(data), + mime=mime, + ) return s3_path diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index f89f37662c..53f541e2d3 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -259,8 +259,14 @@ def process_page_wikitext(self) -> None: with py7zr.SevenZipFile(path, "r") as archive: sources = archive.readall() - if self.database.is_deleted_page(page_descr=page_descr, site_slug=self.site_slug): - logger.warning("Page descr '%s' was previously deleted, skipping", page_descr) + if self.database.is_deleted_page( + page_descr=page_descr, + site_slug=self.site_slug, + ): + logger.warning( + "Page descr '%s' was previously deleted, skipping", + page_descr, + ) continue page_id = self.get_page_id(page_descr=page_descr) From f6d62ca29439367132e5dea48f338a59036948ca Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 22:21:05 -0400 Subject: [PATCH 115/133] Add deleted page for other branch. --- deepwell/importer/database.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 4fbc930e68..a3331f8ff9 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -182,9 +182,15 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ) else: logger.warning( - "Found another version of page ID, looks newer, skipping", + "Found another version of page ID %d, looks newer, skipping", page_id, ) + self.add_deleted_page( + cur, + page_descr=prior_page_descr, + site_slug=site_slug, + page_id=page_id, + ) return # Insert new page From 5e0af3cf34ce90bbfe9826dbb3bf49c0d6672769 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 22:22:24 -0400 Subject: [PATCH 116/133] Fix deletion logic. 
--- deepwell/importer/database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index a3331f8ff9..a80232b4f7 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -176,7 +176,7 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ) self.add_deleted_page( cur, - page_descr=page_descr, + page_descr=prior_page_descr, site_slug=site_slug, page_id=page_id, ) @@ -187,7 +187,7 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ) self.add_deleted_page( cur, - page_descr=prior_page_descr, + page_descr=page_descr, site_slug=site_slug, page_id=page_id, ) From 15ce77023fdd414f7944324bcf183eca402066b8 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 22:23:58 -0400 Subject: [PATCH 117/133] Fix argument. --- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index a80232b4f7..02dbe03f62 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -247,7 +247,7 @@ def add_deleted_page( VALUES (?, ?, ?) """, - (prior_page_descr, site_slug, page_id), + (page_descr, site_slug, page_id), ) def is_deleted_page(self, *, page_descr: str, site_slug: str) -> bool: From c435bec2201f06357b98ca2bd006264bd6315138 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 22:25:56 -0400 Subject: [PATCH 118/133] Log updated fields. --- deepwell/importer/database.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 02dbe03f62..b270dc77cf 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -162,9 +162,11 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N (prior_page_descr, last_sitemap_updated_at) = result if last_sitemap_updated_at < sitemap_updated_at: logger.warning( - "Found updated version of page ID %d, deleting previous '%s'", + "Found updated version of page ID %d, deleting previous '%s' (%d < %d)", page_id, prior_page_descr, + last_sitemap_updated_at, + sitemap_updated_at, ) cur.execute( """ @@ -182,8 +184,10 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N ) else: logger.warning( - "Found another version of page ID %d, looks newer, skipping", + "Found another version of page ID %d, looks newer, skipping (%d ≥ %d)", page_id, + last_sitemap_updated_at, + sitemap_updated_at, ) self.add_deleted_page( cur, From 273e9ae0344da722b3b8a3a63902cf2ca1743600 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 22:47:38 -0400 Subject: [PATCH 119/133] Modify exists() method for checking database and S3. --- deepwell/importer/database.py | 15 +++++++++++++++ deepwell/importer/s3.py | 8 +++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index b270dc77cf..a062093412 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -378,6 +378,21 @@ def add_blob(self, cur, *, hex_hash: str, length: int, mime: str) -> None: (hex_hash, mime, length), ) + def blob_exists(self, hex_hash: str) -> bool: + with self.conn as cur: + result = cur.execute( + """ + SELECT * + FROM blob + WHERE hex_hash = ? 
+ """, + (hex_hash,) + ).fetchone() + + exists = result is not None + logger.debug("Checking blob existence: %s (%s)", hex_hash, exists) + return exists + def add_file( self, cur, diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index ba841ae097..707ae27d86 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -22,7 +22,7 @@ def __init__(self, *, aws_profile, bucket, database) -> None: self.bucket = bucket self.database = database - def exists(self, s3_path: str) -> bool: + def s3_exists(self, s3_path: str) -> bool: try: self.s3_client.head_object( Bucket=self.s3_bucket, @@ -32,6 +32,12 @@ def exists(self, s3_path: str) -> bool: except: return False + def exists(self, hex_hash: str) -> bool: + s3_exists = self.s3_exists(hex_hash) + blob_exists = self.database.blob_exists(hex_hash) + assert s3_exists == blob_exists, "Mismatch between S3 blob and database table" + return s3_exists + def upload(self, file_path: str, mime: str) -> str: with open(file_path, "rb") as file: data = file.read() From 622e04ed85578f3d5f9b038cbfd1a20166d6398a Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 22:53:20 -0400 Subject: [PATCH 120/133] Only use database blob check. S3 has stuff from prior runs. --- deepwell/importer/s3.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 707ae27d86..52116d19b4 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -22,22 +22,6 @@ def __init__(self, *, aws_profile, bucket, database) -> None: self.bucket = bucket self.database = database - def s3_exists(self, s3_path: str) -> bool: - try: - self.s3_client.head_object( - Bucket=self.s3_bucket, - Key=s3_path, - ) - return True - except: - return False - - def exists(self, hex_hash: str) -> bool: - s3_exists = self.s3_exists(hex_hash) - blob_exists = self.database.blob_exists(hex_hash) - assert s3_exists == blob_exists, "Mismatch between S3 blob and database table" - return s3_exists - def upload(self, file_path: str, mime: str) -> str: with open(file_path, "rb") as file: data = file.read() @@ -46,7 +30,7 @@ def upload(self, file_path: str, mime: str) -> str: if not data: logger.debug("Skipping upload of empty S3 object") - elif self.exists(s3_path): + elif self.database.blob_exists(s3_path): logger.debug("S3 object %s already exists", s3_path) else: logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) From 77972a01b0ee5c30ae1b8bd371f9dafe3113c0f7 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 23:09:16 -0400 Subject: [PATCH 121/133] Add back magic, use it in case a file_metadata entry is missing. 
--- deepwell/importer/requirements.txt | 1 + deepwell/importer/site.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/deepwell/importer/requirements.txt b/deepwell/importer/requirements.txt index 46eed78c90..e526254bd6 100644 --- a/deepwell/importer/requirements.txt +++ b/deepwell/importer/requirements.txt @@ -1,3 +1,4 @@ boto3>=1.34.0 py7zr>=0.21.0 pycryptodome>=3.20.0 +python-magic>=0.4.0 diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 53f541e2d3..34a8b9fbd4 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -8,6 +8,7 @@ from urllib.parse import unquote as percent_unquote from urllib.request import urlopen +import magic import py7zr from .database import Database @@ -305,15 +306,21 @@ def process_files(self) -> None: for file_id_str, entry in mapping.items(): file_id = int(file_id_str) wikidot_url = entry["url"] - file_metadata = self.file_metadata[file_id] - logger.debug("Processing file stored at %s", wikidot_url) + logger.debug("Processing file stored at %s", wikidot_url) page_slug_url, filename = os.path.split(entry["path"]) page_slug = percent_unquote(page_slug_url) page_id = self.get_page_id(page_slug=page_slug) - path = os.path.join(self.file_dir, page_slug_url, file_id_str) - s3_hash = self.s3.upload(path, file_metadata["mime"]) + + try: + file_metadata = self.file_metadata[file_id] + mime = file_metadata["mime"] + except KeyError: + # No data, get MIME via libmagic + mime = magic.from_file(path, mime=True) + + s3_hash = self.s3.upload(path, mime) self.database.add_file( cur, From 4cfdfd788cf1892c9d89822db4e182f971bda936 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 23:37:51 -0400 Subject: [PATCH 122/133] Ignore un-downloaded files. I hate consistency issues >:( --- deepwell/importer/site.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 34a8b9fbd4..52776dfad9 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -313,6 +313,10 @@ def process_files(self) -> None: page_id = self.get_page_id(page_slug=page_slug) path = os.path.join(self.file_dir, page_slug_url, file_id_str) + if not os.path.isfile(path): + logger.error("File in map but not downloaded: %s (%d)", page_slug_url, file_id_str) + continue + try: file_metadata = self.file_metadata[file_id] mime = file_metadata["mime"] From 53adddc391b54b5ebaf8fee539761e0ce6fdc81b Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Thu, 4 Jul 2024 23:54:05 -0400 Subject: [PATCH 123/133] Emit commas for lengths. --- deepwell/importer/s3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/s3.py b/deepwell/importer/s3.py index 52116d19b4..0506d86586 100644 --- a/deepwell/importer/s3.py +++ b/deepwell/importer/s3.py @@ -33,7 +33,7 @@ def upload(self, file_path: str, mime: str) -> str: elif self.database.blob_exists(s3_path): logger.debug("S3 object %s already exists", s3_path) else: - logger.info("Uploading S3 object %s (len %d)", s3_path, len(data)) + logger.info("Uploading S3 object %s (len %s)", s3_path, f"{len(data):,}") self.client.put_object( Bucket=self.bucket, Key=s3_path, From fa60b1758540a83a2546c4719e42168e6871a6c0 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Fri, 5 Jul 2024 00:02:22 -0400 Subject: [PATCH 124/133] Consume missing page for file. If it's not in there, then the file must be leftover, from a deleted page. 
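Together with patch 122 above, the per-file loop now skips, rather than crashes on, the two inconsistencies these dumps exhibit. Roughly (hypothetical helper mirroring the guards in process_files):

    import os

    def file_ingestable(path: str, known_page_slugs: set, page_slug: str) -> bool:
        if page_slug not in known_page_slugs:
            return False  # leftover file from a since-deleted page
        if not os.path.isfile(path):
            return False  # listed in file_map.json but never downloaded
        return True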
--- deepwell/importer/site.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 52776dfad9..b5f7329874 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -310,7 +310,13 @@ def process_files(self) -> None: logger.debug("Processing file stored at %s", wikidot_url) page_slug_url, filename = os.path.split(entry["path"]) page_slug = percent_unquote(page_slug_url) - page_id = self.get_page_id(page_slug=page_slug) + + try: + page_id = self.get_page_id(page_slug=page_slug) + except RuntimeError: + self.logger.error("Cannot find associated page with slug '%s'", page_slug) + continue + path = os.path.join(self.file_dir, page_slug_url, file_id_str) if not os.path.isfile(path): From 19823d36933d06fe29057bd8b363ed1795268bc6 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Fri, 5 Jul 2024 00:38:17 -0400 Subject: [PATCH 125/133] Fix logger call. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index b5f7329874..b14fc744bf 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -314,7 +314,7 @@ def process_files(self) -> None: try: page_id = self.get_page_id(page_slug=page_slug) except RuntimeError: - self.logger.error("Cannot find associated page with slug '%s'", page_slug) + logger.error("Cannot find associated page with slug '%s'", page_slug) continue path = os.path.join(self.file_dir, page_slug_url, file_id_str) From 3998b00ee3bf1b1889a33c8c6706261ab752867f Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Fri, 5 Jul 2024 20:47:25 -0400 Subject: [PATCH 126/133] Support forum threads created by Wikidot. --- deepwell/importer/seed.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/seed.sql b/deepwell/importer/seed.sql index 4901207d6c..bd6a82c2e8 100644 --- a/deepwell/importer/seed.sql +++ b/deepwell/importer/seed.sql @@ -105,7 +105,7 @@ CREATE TABLE forum_thread ( title TEXT NOT NULL, description TEXT NOT NULL, created_at INTEGER NOT NULL, - created_by INTEGER NOT NULL REFERENCES users(user_id), + created_by INTEGER REFERENCES users(user_id), -- NULL means wikidot post_count INTEGER NOT NULL, sticky INTEGER NOT NULL CHECK (sticky IN (0, 1)), -- boolean locked INTEGER NOT NULL CHECK (locked IN (0, 1)), -- boolean From 0500c16f1bcb43bf46da8bfe570136dc69614aaf Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Fri, 5 Jul 2024 20:59:44 -0400 Subject: [PATCH 127/133] Fix percent type. --- deepwell/importer/site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index b14fc744bf..9184e4f0c5 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -320,7 +320,7 @@ def process_files(self) -> None: path = os.path.join(self.file_dir, page_slug_url, file_id_str) if not os.path.isfile(path): - logger.error("File in map but not downloaded: %s (%d)", page_slug_url, file_id_str) + logger.error("File in map but not downloaded: %s (%s)", page_slug_url, file_id_str) continue try: From f1d02b953a78f84f416962cdd9601a1265fe37af Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Fri, 5 Jul 2024 21:04:49 -0400 Subject: [PATCH 128/133] Run black formatter.
--- deepwell/importer/database.py | 2 +- deepwell/importer/site.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index a062093412..6d6afbf42a 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -386,7 +386,7 @@ def blob_exists(self, hex_hash: str) -> bool: FROM blob WHERE hex_hash = ? """, - (hex_hash,) + (hex_hash,), ).fetchone() exists = result is not None diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 9184e4f0c5..1c0e744c20 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -314,13 +314,20 @@ def process_files(self) -> None: try: page_id = self.get_page_id(page_slug=page_slug) except RuntimeError: - logger.error("Cannot find associated page with slug '%s'", page_slug) + logger.error( + "Cannot find associated page with slug '%s'", + page_slug, + ) continue path = os.path.join(self.file_dir, page_slug_url, file_id_str) if not os.path.isfile(path): - logger.error("File in map but not downloaded: %s (%s)", page_slug_url, file_id_str) + logger.error( + "File in map but not downloaded: %s (%s)", + page_slug_url, + file_id_str, + ) continue try: From a4c9b6a6f4122b717fc447b87163a0b6bd0b52a2 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Fri, 5 Jul 2024 21:17:12 -0400 Subject: [PATCH 129/133] Add default for missing isLocked field. --- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 6d6afbf42a..9107a2455b 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -509,7 +509,7 @@ def add_forum_thread(self, cur, forum_category_id: int, metadata: dict) -> None: metadata["startedUser"], metadata["postsNum"], metadata["sticky"], - metadata["isLocked"], + metadata.get("isLocked", False), metadata.get("version"), ), ) From 59b13137a6f7fbd7f18e6b33700915ff1698254c Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 6 Jul 2024 00:48:44 -0400 Subject: [PATCH 130/133] Add another default False. --- deepwell/importer/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 9107a2455b..8e04a31f92 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -221,7 +221,7 @@ def add_page(self, cur, *, site_slug: str, page_descr: str, metadata: dict) -> N site_slug, sitemap_updated_at, metadata.get("title", ""), - metadata["is_locked"], + metadata.get("is_locked", False), json.dumps(metadata.get("tags", [])), ), ) From 2eac7f06ae8eaef97d69fa5030664c7538f56136 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Sat, 6 Jul 2024 23:53:35 -0400 Subject: [PATCH 131/133] Resolve conflict issue in add_forum_post_revision(). --- deepwell/importer/database.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index 8e04a31f92..edf2b4c9c5 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -571,6 +571,8 @@ def add_forum_post_revision(self, cur, post_id: int, metadata: dict) -> None: ) VALUES (?, ?, ?, ?, ?) + ON CONFLICT + DO NOTHING """, ( revision_id, From 1ad5d8b05e2c22171286bfec5986e466d24abfe5 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Tue, 9 Jul 2024 00:20:34 -0400 Subject: [PATCH 132/133] Add handling for anonymous revisions. 
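Patches 129 and 130 above and the diff below share one theme: Wikidot metadata omits or nulls fields, so each read needs a coalescing default. The two idioms in play, sketched (the ANONYMOUS_USER_ID value is taken from the diff below; field names from the dumps):

    ANONYMOUS_USER_ID = 2

    def coalesce_fields(data: dict) -> tuple[bool, int]:
        is_locked = data.get("is_locked", False)      # absent key -> default
        author = data["author"] or ANONYMOUS_USER_ID  # null author -> anonymous
        return is_locked, author

Note that the or-based fallback treats any falsy author (None or 0) as anonymous, which is safe only as long as 0 is never a real user ID.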
--- deepwell/importer/database.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/database.py b/deepwell/importer/database.py index edf2b4c9c5..6e859bf59e 100644 --- a/deepwell/importer/database.py +++ b/deepwell/importer/database.py @@ -9,6 +9,8 @@ logger = logging.getLogger(__name__) +ANONYMOUS_USER_ID = 2 + class Database: __slots__ = ("conn",) @@ -303,7 +305,7 @@ def add_page_revision_metadata(self, cur, page_id: int, data: dict) -> None: data["global_revision"], data["revision"], page_id, - data["author"], + data["author"] or ANONYMOUS_USER_ID, data["stamp"], data["flags"], data["commentary"], From af85d709113f098d553a9b836ee7fc5c9a68e3a0 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Wed, 10 Jul 2024 01:23:05 -0400 Subject: [PATCH 133/133] Ignore missing pages when inserting wikitexts. --- deepwell/importer/site.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deepwell/importer/site.py b/deepwell/importer/site.py index 1c0e744c20..8527f0c35d 100644 --- a/deepwell/importer/site.py +++ b/deepwell/importer/site.py @@ -270,7 +270,12 @@ def process_page_wikitext(self) -> None: ) continue - page_id = self.get_page_id(page_descr=page_descr) + try: + page_id = self.get_page_id(page_descr=page_descr) + except RuntimeError: + logger.error("No page descr '%s' found to insert wikitext", page_descr) + return + # Convert and begin adding to the database self.process_page_revisions_wikitext(page_id, sources)
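A closing note on the two hashing conventions the upload path mentions ("files use SHA256, text uses K12"): the kangaroo_twelve helper is imported from utils.py, which these patches never show. A plausible implementation on top of pycryptodome, which requirements.txt already pins, might look like the sketch below; this is an assumption for illustration, not the project's confirmed code:

    import hashlib
    from Crypto.Hash import KangarooTwelve  # pycryptodome

    def sha256_hex(data: bytes) -> str:
        # File blobs: this digest doubles as the S3 key and blob.hex_hash.
        return hashlib.sha256(data).hexdigest()

    def kangaroo_twelve(contents: str) -> str:
        # Wikitext: XOF digest keys rows in the text table (32-byte output assumed).
        k12 = KangarooTwelve.new()
        k12.update(contents.encode("utf-8"))
        return k12.read(32).hex()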