Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FCL-309] Add NCN to identifiers on ingest #231

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MARKLOGIC_HOST=host.docker.internal
MARKLOGIC_USER=
MARKLOGIC_PASSWORD=
MARKLOGIC_USER=admin
MARKLOGIC_PASSWORD=admin
XSLT_IMAGE_LOCATION=
AWS_ACCESS_KEY_ID=123
AWS_SECRET_KEY=xyz
Expand Down
48 changes: 36 additions & 12 deletions ds-caselaw-ingester/lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import os
import tarfile
import xml.etree.ElementTree as ET
from typing import Dict, List, Tuple
from urllib.parse import unquote_plus
from xml.sax.saxutils import escape
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
from caselawclient.models.documents import DocumentURIString

import boto3
import rollbar
Expand All @@ -18,6 +19,11 @@
from caselawclient.client_helpers import VersionAnnotation, VersionType
from dotenv import load_dotenv
from notifications_python_client.notifications import NotificationsAPIClient
import logging
from caselawclient.models.documents import Document

logger = logging.getLogger("ingester")
logger.setLevel(logging.DEBUG)

load_dotenv()
rollbar.init(os.getenv("ROLLBAR_TOKEN"), environment=os.getenv("ROLLBAR_ENV"))
Expand All @@ -37,7 +43,7 @@ def __init__(self, metadata):
self.parameters = metadata.get("parameters", {})

@property
def is_tdr(self):
def is_tdr(self) -> bool:
return "TDR" in self.parameters.keys()

@property
Expand Down Expand Up @@ -86,7 +92,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

@property
def originator(self):
def originator(self) -> str:
return self.message.get("parameters", {}).get("originator")

def get_consignment_reference(self):
Expand All @@ -100,7 +106,7 @@ def get_consignment_reference(self):

raise InvalidMessageException("Malformed v2 message, please supply a reference")

def save_s3_response(self, sqs_client, s3_client):
def save_s3_response(self, sqs_client, s3_client) -> str:
s3_bucket = self.message.get("parameters", {}).get("s3Bucket")
s3_key = self.message.get("parameters", {}).get("s3Key")
reference = self.get_consignment_reference()
Expand Down Expand Up @@ -195,7 +201,7 @@ def modify_filename(original: str, addition: str) -> str:
return os.path.join(path, new_basename)


def all_messages(event) -> List[Message]:
def all_messages(event) -> list[Message]:
"""All the messages in the SNS event, as Message subclasses"""
decoder = json.decoder.JSONDecoder()
messages_as_decoded_json = [decoder.decode(record["Sns"]["Message"]) for record in event["Records"]]
Expand Down Expand Up @@ -249,7 +255,7 @@ def extract_docx_filename(metadata: dict, consignment_reference: str) -> str:
)


def extract_lambda_versions(versions: List[Dict[str, str]]) -> List[Tuple[str, str]]:
def extract_lambda_versions(versions: list[dict[str, str]]) -> list[tuple[str, str]]:
version_tuples = []
for d in versions:
version_tuples += list(d.items())
Expand Down Expand Up @@ -437,6 +443,23 @@ def insert_document_xml(self) -> bool:
api_client.insert_document_xml(self.uri, self.xml, annotation)
return True

def set_document_identifiers(self) -> None:
    """Record the ingested document's neutral citation as an identifier.

    Fetches the document stored at ``self.uri`` and, if it exposes a
    non-empty ``neutral_citation``, adds a ``NeutralCitationNumber``
    identifier for it and saves the document's identifiers.

    Documents without a ``neutral_citation`` attribute, or with an empty
    one, are left untouched.
    """
    # Trace message only: DEBUG on the module logger, not CRITICAL on the
    # root logger, so it does not show up as a production alert.
    logger.debug("start set_document_identifiers")
    doc = api_client.get_document_by_uri(DocumentURIString(self.uri))

    if doc.identifiers:
        # Not fatal: warn and carry on, the NCN is still added below.
        # NOTE(review): this path looks reachable when an existing document
        # is re-ingested; confirm that adding the NCN again cannot create a
        # duplicate identifier.
        logger.warning("Ingesting, but identifiers already present for %s!", self.uri)

    # Not every document type has a neutral citation; a missing attribute
    # is treated the same as an empty value.
    ncn = getattr(doc, "neutral_citation", None)
    if not ncn:
        return

    doc.identifiers.add(NeutralCitationNumber(ncn))
    doc.identifiers.save(doc)
    logger.info("Ingested document had NCN %s", ncn)

def send_updated_judgment_notification(self) -> None:
personalisation = personalise_email(self.uri, self.metadata)
if os.getenv("ROLLBAR_ENV") != "prod":
Expand Down Expand Up @@ -501,7 +524,7 @@ def store_metadata(self) -> None:
value=tdr_metadata["Consignment-Completed-Datetime"],
)

def save_files_to_s3(self):
def save_files_to_s3(self) -> None:
sqs_client, s3_client = aws_clients()
# Determine if there's a word document -- we need to know before we save the tar.gz file
docx_filename = extract_docx_filename(self.metadata, self.consignment_reference)
Expand Down Expand Up @@ -555,7 +578,7 @@ def save_files_to_s3(self):
)

@property
def metadata_object(self):
def metadata_object(self) -> Metadata:
return Metadata(self.metadata)

def will_publish(self) -> bool:
Expand All @@ -574,7 +597,7 @@ def will_publish(self) -> bool:

raise RuntimeError(f"Didn't recognise originator {originator!r}")

def send_email(self):
def send_email(self) -> None:
originator = self.message.originator
if originator == "FCL":
return None
Expand All @@ -587,19 +610,20 @@ def send_email(self):

raise RuntimeError(f"Didn't recognise originator {originator!r}")

def close_tar(self):
def close_tar(self) -> None:
self.tar.close()

def upload_xml(self):
def upload_xml(self) -> None:
self.updated = self.update_document_xml()
self.inserted = False if self.updated else self.insert_document_xml()
if not self.updated and not self.inserted:
raise DocumentInsertionError(
f"Judgment {self.uri} failed to insert into Marklogic. Consignment Ref: {self.consignment_reference}"
)
self.set_document_identifiers()

@property
def upload_state(self):
def upload_state(self) -> str:
return "updated" if self.updated else "inserted"


Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
django-environ~=0.10
ds-caselaw-marklogic-api-client==27.4.0
ds-caselaw-marklogic-api-client==29.0.0a2
requests-toolbelt~=1.0
urllib3~=2.2
boto3
Expand Down
Loading