Parser Logging (#2665)
* - Added custom formatter
- Added specific log config for parsing
- Added logging throughout parsing

* - Adding tty to docker
- Update colors

* - Added more logging based on discussion.

* - Small updates to logs/color

* - Fix lint errors

* - Added repr/str for datafile
- Updated repr/str for parsererror
- Updated based on review comments

* - fix lint errors

* - Updated based on review comments

* - Updating buildpack

* Revert "- Updating buildpack"

This reverts commit 9854dcb.

* - extra logging message

* Revert "- extra logging message"

This reverts commit 8f19e95.

---------

Co-authored-by: Alex P <[email protected]>
elipe17 and ADPennington authored Sep 12, 2023
1 parent d66ec43 commit 539007a
Showing 10 changed files with 107 additions and 9 deletions.
1 change: 1 addition & 0 deletions tdrs-backend/docker-compose.yml
@@ -100,6 +100,7 @@ services:
./gunicorn_start.sh && celery -A tdpservice.settings worker -l info"
ports:
- "5555:5555"
tty: true
depends_on:
- clamav-rest
- localstack
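The `tty: true` addition allocates a pseudo-terminal for the container, which is what lets the ANSI color codes introduced below render as colors instead of being captured as literal escape sequences. A minimal sketch (hypothetical helper, not part of this commit) of the usual guard for emitting color only when attached to a terminal:

```python
import sys

GREEN = "\033[0;32m"
RESET = "\033[0m"

def colorize(message: str) -> str:
    """Wrap a message in ANSI color codes only when stdout is a real TTY."""
    if sys.stdout.isatty():
        return f"{GREEN}{message}{RESET}"
    return message

print(colorize("parsing started"))
```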
38 changes: 38 additions & 0 deletions tdrs-backend/tdpservice/core/logger.py
@@ -0,0 +1,38 @@
"""Contains core logging functionality for TDP."""

import logging

class ColorFormatter(logging.Formatter):
"""Simple formatter class to add color to log messages based on log level."""

BLACK = '\033[0;30m'
RED = '\033[0;31m'
GREEN = '\033[0;32m'
BROWN = '\033[0;33m'
BLUE = '\033[0;34m'
PURPLE = '\033[0;35m'
CYAN = '\033[0;36m'
GREY = '\033[0;37m'

DARK_GREY = '\033[1;30m'
LIGHT_RED = '\033[1;31m'
LIGHT_GREEN = '\033[1;32m'
YELLOW = '\033[1;33m'
LIGHT_BLUE = '\033[1;34m'
LIGHT_PURPLE = '\033[1;35m'
LIGHT_CYAN = '\033[1;36m'
WHITE = '\033[1;37m'

RESET = "\033[0m"

def __init__(self, *args, **kwargs):
self._colors = {logging.DEBUG: self.CYAN,
logging.INFO: self.GREEN,
logging.WARNING: self.YELLOW,
logging.ERROR: self.LIGHT_RED,
logging.CRITICAL: self.RED}
super(ColorFormatter, self).__init__(*args, **kwargs)

def format(self, record):
"""Format the record to be colored based on the log level."""
return self._colors.get(record.levelno, self.WHITE) + super().format(record) + self.RESET
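
For reference, wiring the formatter up by hand looks roughly like this (a sketch assuming the tdpservice package is importable; in this commit the formatter is attached via the LOGGING dict in settings/common.py instead):

```python
import logging

from tdpservice.core.logger import ColorFormatter

handler = logging.StreamHandler()
handler.setFormatter(ColorFormatter("%(asctime)s %(levelname)s %(message)s"))

logger = logging.getLogger("tdpservice.parsers")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

logger.debug("renders cyan")       # DEBUG -> CYAN
logger.info("renders green")       # INFO -> GREEN
logger.error("renders light red")  # ERROR -> LIGHT_RED
```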
9 changes: 9 additions & 0 deletions tdrs-backend/tdpservice/data_files/models.py
@@ -221,6 +221,15 @@ def find_latest_version(self, year, quarter, section, stt):
version=version, year=year, quarter=quarter, section=section, stt=stt,
).first()

def __repr__(self):
"""Return a string representation of the model."""
return f"{{id: {self.id}, filename: {self.original_filename}, STT: {self.stt}, S3 location: " + \
f"{self.s3_location}}}"

def __str__(self):
"""Return a string representation of the model."""
return f"filename: {self.original_filename}"

class LegacyFileTransferManager(models.Manager):
"""Extends object manager functionality for LegacyFileTransfer model."""

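As a reminder of why both methods are overridden: f-strings, str(), and %s logging go through __str__, while containers and debuggers fall back to __repr__. A toy illustration (hypothetical class, not from this commit):

```python
class Toy:
    def __repr__(self):
        return "{id: 1, filename: report.txt}"

    def __str__(self):
        return "filename: report.txt"

t = Toy()
print(f"{t}")  # filename: report.txt             (__str__)
print([t])     # [{id: 1, filename: report.txt}]  (__repr__)
```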
4 changes: 4 additions & 0 deletions tdrs-backend/tdpservice/data_files/views.py
@@ -66,6 +66,10 @@ def create(self, request, *args, **kwargs):
data_file_id = response.data.get('id')
data_file = DataFile.objects.get(id=data_file_id)

logger.info(f"Preparing parse task: User META -> user: {request.user}, stt: {data_file.stt}. " +
f"Datafile META -> datafile: {data_file_id}, section: {data_file.section}, " +
f"quarter {data_file.quarter}, year {data_file.year}.")

parser_task.parse.delay(data_file_id)
logger.info("Submitted parse task to queue for datafile %s.", data_file_id)

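Note the two logging styles side by side above: the f-string is formatted eagerly, while the %s form defers formatting until the record is actually emitted (and skips it entirely if the level is disabled). A quick sketch of the difference:

```python
import logging

logger = logging.getLogger(__name__)
data_file_id = 42

# Eager: the f-string is built even if INFO logging is disabled.
logger.info(f"Preparing parse task for datafile {data_file_id}.")

# Lazy: %s formatting happens only when the record is handled.
logger.info("Submitted parse task to queue for datafile %s.", data_file_id)
```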
9 changes: 9 additions & 0 deletions tdrs-backend/tdpservice/parsers/fields.py
@@ -1,5 +1,9 @@
"""Datafile field representations."""

import logging

logger = logging.getLogger(__name__)

def value_is_empty(value, length):
"""Handle 'empty' values as field inputs."""
empty_values = [
@@ -36,6 +40,7 @@ def parse_value(self, line):
value = line[self.startIndex:self.endIndex]

if value_is_empty(value, self.endIndex-self.startIndex):
logger.debug(f"Field: '{self.name}' at position: [{self.startIndex}, {self.endIndex}) is empty.")
return None

match self.type:
@@ -44,9 +49,13 @@ def parse_value(self, line):
value = int(value)
return value
except ValueError:
logger.error(f"Error parsing field value: {value} to integer.")
return None
case 'string':
return value
case _:
logger.warn(f"Unknown field type: {self.type}.")
return None

class TransformField(Field):
"""Represents a field that requires some transformation before serializing."""
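The logic above slices a fixed-width field out of the line and then dispatches on the field type with a match statement. A self-contained sketch of the same flow (a simplified stand-in class with assumed constructor arguments; it uses logger.warning, since logger.warn is a deprecated alias):

```python
import logging

logger = logging.getLogger(__name__)

class FieldSketch:
    """Minimal stand-in for Field, assuming name/type/startIndex/endIndex."""

    def __init__(self, name, type, startIndex, endIndex):
        self.name = name
        self.type = type
        self.startIndex = startIndex
        self.endIndex = endIndex

    def parse_value(self, line):
        # Slice the fixed-width window [startIndex, endIndex) out of the line.
        value = line[self.startIndex:self.endIndex]
        match self.type:
            case 'number':
                try:
                    return int(value)
                except ValueError:
                    logger.error(f"Error parsing field value: {value} to integer.")
                    return None
            case 'string':
                return value
            case _:
                logger.warning(f"Unknown field type: {self.type}.")
                return None

field = FieldSketch(name='record_type', type='string', startIndex=0, endIndex=2)
assert field.parse_value("T1202301") == "T1"
```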
5 changes: 3 additions & 2 deletions tdrs-backend/tdpservice/parsers/models.py
@@ -57,11 +57,12 @@ def rpt_month_name(self):

def __repr__(self):
"""Return a string representation of the model."""
return f"ParserError {self.id} for file {self.file} and object key {self.object_id}"
return f"{{id: {self.id}, file: {self.file.id}, row: {self.row_number}, column: {self.column_number}, " + \
f"error message: {self.error_message}}}"

def __str__(self):
"""Return a string representation of the model."""
return f"ParserError {self.id}"
return f"error_message: {self.error_message}"

def _get_error_message(self):
"""Return the error message."""
31 changes: 28 additions & 3 deletions tdrs-backend/tdpservice/parsers/parse.py
@@ -23,15 +23,18 @@ def parse_datafile(datafile):
util.make_generate_parser_error(datafile, 1)
)
if not header_is_valid:
logger.info(f"Preparser Error: {len(header_errors)} header errors encountered.")
errors['header'] = header_errors
bulk_create_errors({1: header_errors}, 1, flush=True)
return errors

is_encrypted = util.contains_encrypted_indicator(header_line, schema_defs.header.get_field_by_name("encryption"))
logger.debug(f"Datafile has encrypted fields: {is_encrypted}.")

# ensure file section matches upload section
program_type = header['program_type']
section = header['type']
logger.debug(f"Program type: {program_type}, Section: {section}.")

section_is_valid, section_error = validators.validate_header_section_matches_submission(
datafile,
@@ -40,6 +43,7 @@ def parse_datafile(datafile):
)

if not section_is_valid:
logger.info(f"Preparser Error -> Section is not valid: {section_error.error_message}")
errors['document'] = [section_error]
unsaved_parser_errors = {1: [section_error]}
bulk_create_errors(unsaved_parser_errors, 1, flush=True)
@@ -55,12 +59,17 @@
def bulk_create_records(unsaved_records, line_number, header_count, batch_size=10000, flush=False):
"""Bulk create passed in records."""
if (line_number % batch_size == 0 and header_count > 0) or flush:
logger.debug("Bulk creating records.")
try:
num_created = 0
num_expected = 0
for model, records in unsaved_records.items():
num_expected += len(records)
num_created += len(model.objects.bulk_create(records))
if num_created != num_expected:
logger.error(f"Bulk create only created {num_created}/{num_expected}!")
else:
logger.info(f"Created {num_created}/{num_expected} records.")
return num_created == num_expected, {}
except DatabaseError as e:
logger.error(f"Encountered error while creating datafile records: {e}")
@@ -70,7 +79,10 @@ def bulk_create_records(unsaved_records, line_number, header_count, batch_size=1
def bulk_create_errors(unsaved_parser_errors, num_errors, batch_size=5000, flush=False):
"""Bulk create all ParserErrors."""
if flush or (unsaved_parser_errors and num_errors >= batch_size):
ParserError.objects.bulk_create(list(itertools.chain.from_iterable(unsaved_parser_errors.values())))
logger.debug("Bulk creating ParserErrors.")
num_created = len(ParserError.objects.bulk_create(list(itertools.chain.from_iterable(
unsaved_parser_errors.values()))))
logger.info(f"Created {num_created}/{num_errors} ParserErrors.")
return {}, 0
return unsaved_parser_errors, num_errors

@@ -94,12 +106,16 @@ def evaluate_trailer(datafile, trailer_count, multiple_trailer_errors, is_last_l

def rollback_records(unsaved_records, datafile):
"""Delete created records in the event of a failure."""
logger.info("Rolling back created records.")
for model in unsaved_records:
model.objects.filter(datafile=datafile).delete()
num_deleted, models = model.objects.filter(datafile=datafile).delete()
logger.debug(f"Deleted {num_deleted} records of type: {model}.")

def rollback_parser_errors(datafile):
"""Delete created errors in the event of a failure."""
ParserError.objects.filter(file=datafile).delete()
logger.info("Rolling back created parser errors.")
num_deleted, models = ParserError.objects.filter(file=datafile).delete()
logger.debug(f"Deleted {num_deleted} {ParserError}.")

def parse_datafile_lines(datafile, program_type, section, is_encrypted):
"""Parse lines with appropriate schema and return errors."""
@@ -135,13 +151,16 @@ def parse_datafile_lines(datafile, program_type, section, is_encrypted):
is_last, line, line_number)

if trailer_errors is not None:
logger.debug(f"{len(trailer_errors)} trailer error(s) detected for file " +
f"'{datafile.original_filename}' on line {line_number}.")
errors['trailer'] = trailer_errors
unsaved_parser_errors.update({"trailer": trailer_errors})
num_errors += len(trailer_errors)

generate_error = util.make_generate_parser_error(datafile, line_number)

if header_count > 1:
logger.info(f"Preparser Error -> Multiple headers found for file: {datafile.id} on line: {line_number}.")
errors.update({'document': ['Multiple headers found.']})
err_obj = generate_error(
schema=None,
@@ -173,6 +192,9 @@ def parse_datafile_lines(datafile, program_type, section, is_encrypted):
record_number += 1
record, record_is_valid, record_errors = r
if not record_is_valid:
logger.debug(f"Record #{i} from line {line_number} is invalid.")
line_errors = errors.get(f"{line_number}_{i}", {})
line_errors.update({record_number: record_errors})
errors.update({f"{line_number}_{i}": record_errors})
unsaved_parser_errors.update({f"{line_number}_{i}": record_errors})
num_errors += len(record_errors)
@@ -185,6 +207,7 @@ def parse_datafile_lines(datafile, program_type, section, is_encrypted):
unsaved_parser_errors, num_errors = bulk_create_errors(unsaved_parser_errors, num_errors)

if header_count == 0:
logger.info(f"Preparser Error -> No headers found for file: {datafile.id}.")
errors.update({'document': ['No headers found.']})
err_obj = generate_error(
schema=None,
@@ -203,6 +226,7 @@ def parse_datafile_lines(datafile, program_type, section, is_encrypted):
# successfully create the records.
all_created, unsaved_records = bulk_create_records(unsaved_records, line_number, header_count, flush=True)
if not all_created:
logger.error(f"Not all parsed records created for file: {datafile.id}!")
rollback_records(unsaved_records, datafile)
bulk_create_errors(unsaved_parser_errors, num_errors, flush=True)
return errors
@@ -218,6 +242,7 @@ def manager_parse_line(line, schema_manager, generate_error):
records = schema_manager.parse_and_validate(line, generate_error)
return records

logger.debug("Record Type is missing from record.")
return [(None, False, [
generate_error(
schema=None,
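The bulk-create helpers above follow an accumulate-and-flush pattern: records pile up in a dict keyed by model class, and every batch_size lines (or on a final flush=True) they are written out in one bulk query. A stripped-down, framework-free sketch of the pattern (the real helpers also count created rows and report mismatches):

```python
class FakeManager:
    """Stand-in for a Django manager; bulk_create issues one INSERT per call."""

    def bulk_create(self, records):
        print(f"INSERT {len(records)} rows")
        return records

class FakeModel:
    objects = FakeManager()

def bulk_create_records(unsaved_records, line_number, batch_size=3, flush=False):
    """Flush accumulated records every `batch_size` lines, or when forced."""
    if line_number % batch_size == 0 or flush:
        for model, records in unsaved_records.items():
            model.objects.bulk_create(records)
        return {}           # reset the accumulator
    return unsaved_records  # keep accumulating

unsaved = {}
line_number = 0
for line_number, record in enumerate(["r1", "r2", "r3", "r4"], start=1):
    unsaved.setdefault(FakeModel, []).append(record)
    unsaved = bulk_create_records(unsaved, line_number)   # flushes 3 rows at line 3
bulk_create_records(unsaved, line_number, flush=True)     # final flush writes the remaining row
```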
4 changes: 4 additions & 0 deletions tdrs-backend/tdpservice/parsers/row_schema.py
@@ -1,6 +1,9 @@
"""Row schema for datafile."""
from .models import ParserErrorCategoryChoices
from .fields import Field, value_is_empty
import logging

logger = logging.getLogger(__name__)


class RowSchema:
@@ -45,6 +48,7 @@ def parse_and_validate(self, line, generate_error):
if not preparsing_is_valid:
if self.quiet_preparser_errors:
return None, True, []
logger.info(f"{len(preparsing_errors)} preparser error(s) encountered.")
return None, False, preparsing_errors

# parse line to model
4 changes: 2 additions & 2 deletions tdrs-backend/tdpservice/scheduling/parser_task.py
@@ -16,6 +16,6 @@ def parse(data_file_id):
# for undetermined amount of time.
data_file = DataFile.objects.get(id=data_file_id)

logger.info(f"DataFile parsing started for file {data_file.filename}")
logger.info(f"DataFile parsing started for file -> {repr(data_file)}")
errors = parse_datafile(data_file)
logger.info(f"DataFile parsing finished with {len(errors)} errors: {errors}")
logger.info(f"DataFile parsing finished with {len(errors)} errors, for file -> {repr(data_file)}.")
11 changes: 9 additions & 2 deletions tdrs-backend/tdpservice/settings/common.py
@@ -205,10 +205,12 @@ class Common(Configuration):
},
"verbose": {
"format": (
"[%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s"
"%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s"
)
},
"simple": {"format": "%(levelname)s %(message)s"},
"color": {"()": "tdpservice.core.logger.ColorFormatter",
"format": "%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s"}
},
"filters": {"require_debug_true": {"()": "django.utils.log.RequireDebugTrue"}},
"handlers": {
@@ -224,7 +226,7 @@ class Common(Configuration):
},
"application": {
"class": "logging.StreamHandler",
"formatter": "verbose",
"formatter": "color",
},
},
"loggers": {
@@ -233,6 +235,11 @@ class Common(Configuration):
"propagate": True,
"level": LOGGING_LEVEL
},
"tdpservice.parsers": {
"handlers": ["application"],
"propagate": False,
"level": LOGGING_LEVEL
},
"django": {"handlers": ["console"], "propagate": True},
"django.server": {
"handlers": ["django.server"],
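The "()" key is the standard logging.config hook for a custom formatter factory, and propagate: False keeps parser records from also being emitted by ancestor loggers. A condensed, standalone sketch of how the pieces fit together (assuming tdpservice.core.logger is importable):

```python
import logging.config

LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "color": {
            "()": "tdpservice.core.logger.ColorFormatter",
            "format": "%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s",
        },
    },
    "handlers": {
        "application": {"class": "logging.StreamHandler", "formatter": "color"},
    },
    "loggers": {
        # propagate=False: parser logs are handled here once, not by the root logger too.
        "tdpservice.parsers": {"handlers": ["application"], "propagate": False, "level": "DEBUG"},
    },
}

logging.config.dictConfig(LOGGING)
logging.getLogger("tdpservice.parsers").info("colored parser log line")
```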
