From 539007ae1b08d6acb97c74badeb6e5bf1d486c42 Mon Sep 17 00:00:00 2001
From: Eric Lipe <125676261+elipe17@users.noreply.github.com>
Date: Tue, 12 Sep 2023 03:34:37 -0600
Subject: [PATCH] Parser Logging (#2665)

* - Added custom formatter
- Added specific log config for parsing
- Added logging throughout parsing

* - Addding tty to docker
- Update colors

* - Added more logging based on discussion.

* - Small updates to logs/color

* - Fix lint errors

* - Added repr/str for datafile
- Updated repr/str for parsererror
- UPdated based on review comments

* - fix lint errors

* - Updated based on review comments

* - Updating buildpack

* Revert "- Updating buildpack"

This reverts commit 9854dcb69461feacfc55e0cff672be42b0e9e1b2.

* - extra logging message

* Revert "- extra logging message"

This reverts commit 8f19e952ddc2d76647c0da965881ba0c152fb416.

---------

Co-authored-by: Alex P <63075587+ADPennington@users.noreply.github.com>
---
 tdrs-backend/docker-compose.yml               |  1 +
 tdrs-backend/tdpservice/core/logger.py        | 38 +++++++++++++++++++
 tdrs-backend/tdpservice/data_files/models.py  |  9 +++++
 tdrs-backend/tdpservice/data_files/views.py   |  4 ++
 tdrs-backend/tdpservice/parsers/fields.py     |  9 +++++
 tdrs-backend/tdpservice/parsers/models.py     |  5 ++-
 tdrs-backend/tdpservice/parsers/parse.py      | 31 +++++++++++++--
 tdrs-backend/tdpservice/parsers/row_schema.py |  4 ++
 .../tdpservice/scheduling/parser_task.py      |  4 +-
 tdrs-backend/tdpservice/settings/common.py    | 11 +++++-
 10 files changed, 107 insertions(+), 9 deletions(-)
 create mode 100644 tdrs-backend/tdpservice/core/logger.py

diff --git a/tdrs-backend/docker-compose.yml b/tdrs-backend/docker-compose.yml
index 9f41bf68e..d9d10d393 100644
--- a/tdrs-backend/docker-compose.yml
+++ b/tdrs-backend/docker-compose.yml
@@ -100,6 +100,7 @@ services:
       ./gunicorn_start.sh && celery -A tdpservice.settings worker -l info"
     ports:
       - "5555:5555"
+    tty: true
     depends_on:
       - clamav-rest
       - localstack
diff --git a/tdrs-backend/tdpservice/core/logger.py b/tdrs-backend/tdpservice/core/logger.py
new file mode 100644
index 000000000..1cd93c50a
--- /dev/null
+++ b/tdrs-backend/tdpservice/core/logger.py
@@ -0,0 +1,38 @@
+"""Contains core logging functionality for TDP."""
+
+import logging
+
+class ColorFormatter(logging.Formatter):
+    """Simple formatter class to add color to log messages based on log level."""
+
+    BLACK = '\033[0;30m'
+    RED = '\033[0;31m'
+    GREEN = '\033[0;32m'
+    BROWN = '\033[0;33m'
+    BLUE = '\033[0;34m'
+    PURPLE = '\033[0;35m'
+    CYAN = '\033[0;36m'
+    GREY = '\033[0;37m'
+
+    DARK_GREY = '\033[1;30m'
+    LIGHT_RED = '\033[1;31m'
+    LIGHT_GREEN = '\033[1;32m'
+    YELLOW = '\033[1;33m'
+    LIGHT_BLUE = '\033[1;34m'
+    LIGHT_PURPLE = '\033[1;35m'
+    LIGHT_CYAN = '\033[1;36m'
+    WHITE = '\033[1;37m'
+
+    RESET = "\033[0m"
+
+    def __init__(self, *args, **kwargs):
+        self._colors = {logging.DEBUG: self.CYAN,
+                        logging.INFO: self.GREEN,
+                        logging.WARNING: self.YELLOW,
+                        logging.ERROR: self.LIGHT_RED,
+                        logging.CRITICAL: self.RED}
+        super(ColorFormatter, self).__init__(*args, **kwargs)
+
+    def format(self, record):
+        """Format the record to be colored based on the log level."""
+        return self._colors.get(record.levelno, self.WHITE) + super().format(record) + self.RESET
diff --git a/tdrs-backend/tdpservice/data_files/models.py b/tdrs-backend/tdpservice/data_files/models.py
index 9a954122c..b4248a9cd 100644
--- a/tdrs-backend/tdpservice/data_files/models.py
+++ b/tdrs-backend/tdpservice/data_files/models.py
@@ -221,6 +221,15 @@ def find_latest_version(self, year, quarter, section, stt):
             version=version,
             year=year, quarter=quarter, section=section, stt=stt,
         ).first()
+    def __repr__(self):
+        """Return a string representation of the model."""
+        return f"{{id: {self.id}, filename: {self.original_filename}, STT: {self.stt}, S3 location: " + \
+            f"{self.s3_location}}}"
+
+    def __str__(self):
+        """Return a string representation of the model."""
+        return f"filename: {self.original_filename}"
+
 
 class LegacyFileTransferManager(models.Manager):
     """Extends object manager functionality for LegacyFileTransfer model."""
diff --git a/tdrs-backend/tdpservice/data_files/views.py b/tdrs-backend/tdpservice/data_files/views.py
index 305985512..f01bb5be6 100644
--- a/tdrs-backend/tdpservice/data_files/views.py
+++ b/tdrs-backend/tdpservice/data_files/views.py
@@ -66,6 +66,10 @@ def create(self, request, *args, **kwargs):
         data_file_id = response.data.get('id')
         data_file = DataFile.objects.get(id=data_file_id)
 
+        logger.info(f"Preparing parse task: User META -> user: {request.user}, stt: {data_file.stt}. " +
+                    f"Datafile META -> datafile: {data_file_id}, section: {data_file.section}, " +
+                    f"quarter {data_file.quarter}, year {data_file.year}.")
+
         parser_task.parse.delay(data_file_id)
         logger.info("Submitted parse task to queue for datafile %s.", data_file_id)
 
diff --git a/tdrs-backend/tdpservice/parsers/fields.py b/tdrs-backend/tdpservice/parsers/fields.py
index 6fb742031..fa4c73101 100644
--- a/tdrs-backend/tdpservice/parsers/fields.py
+++ b/tdrs-backend/tdpservice/parsers/fields.py
@@ -1,5 +1,9 @@
 """Datafile field representations."""
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 def value_is_empty(value, length):
     """Handle 'empty' values as field inputs."""
     empty_values = [
@@ -36,6 +40,7 @@ def parse_value(self, line):
         value = line[self.startIndex:self.endIndex]
 
         if value_is_empty(value, self.endIndex-self.startIndex):
+            logger.debug(f"Field: '{self.name}' at position: [{self.startIndex}, {self.endIndex}) is empty.")
             return None
 
         match self.type:
@@ -44,9 +49,13 @@ def parse_value(self, line):
                     value = int(value)
                     return value
                 except ValueError:
+                    logger.error(f"Error parsing field value: {value} to integer.")
                     return None
             case 'string':
                 return value
+            case _:
+                logger.warn(f"Unknown field type: {self.type}.")
+                return None
 
 class TransformField(Field):
     """Represents a field that requires some transformation before serializing."""
diff --git a/tdrs-backend/tdpservice/parsers/models.py b/tdrs-backend/tdpservice/parsers/models.py
index da95f64e9..4a638e06a 100644
--- a/tdrs-backend/tdpservice/parsers/models.py
+++ b/tdrs-backend/tdpservice/parsers/models.py
@@ -57,11 +57,12 @@ def rpt_month_name(self):
 
     def __repr__(self):
         """Return a string representation of the model."""
-        return f"ParserError {self.id} for file {self.file} and object key {self.object_id}"
+        return f"{{id: {self.id}, file: {self.file.id}, row: {self.row_number}, column: {self.column_number}, " + \
+            f"error message: {self.error_message}}}"
 
     def __str__(self):
         """Return a string representation of the model."""
-        return f"ParserError {self.id}"
+        return f"error_message: {self.error_message}"
 
     def _get_error_message(self):
         """Return the error message."""
diff --git a/tdrs-backend/tdpservice/parsers/parse.py b/tdrs-backend/tdpservice/parsers/parse.py
index 84a64f6e1..2c2183c68 100644
--- a/tdrs-backend/tdpservice/parsers/parse.py
+++ b/tdrs-backend/tdpservice/parsers/parse.py
@@ -23,15 +23,18 @@ def parse_datafile(datafile):
         util.make_generate_parser_error(datafile, 1)
     )
     if not header_is_valid:
+        logger.info(f"Preparser Error: {len(header_errors)} header errors encountered.")
         errors['header'] = header_errors
         bulk_create_errors({1: header_errors}, 1, flush=True)
         return errors
 
     is_encrypted = util.contains_encrypted_indicator(header_line, schema_defs.header.get_field_by_name("encryption"))
+    logger.debug(f"Datafile has encrypted fields: {is_encrypted}.")
 
     # ensure file section matches upload section
     program_type = header['program_type']
     section = header['type']
+    logger.debug(f"Program type: {program_type}, Section: {section}.")
 
     section_is_valid, section_error = validators.validate_header_section_matches_submission(
         datafile,
@@ -40,6 +43,7 @@
     )
 
     if not section_is_valid:
+        logger.info(f"Preparser Error -> Section is not valid: {section_error.error_message}")
         errors['document'] = [section_error]
         unsaved_parser_errors = {1: [section_error]}
         bulk_create_errors(unsaved_parser_errors, 1, flush=True)
@@ -55,12 +59,17 @@ def parse_datafile(datafile):
 def bulk_create_records(unsaved_records, line_number, header_count, batch_size=10000, flush=False):
     """Bulk create passed in records."""
     if (line_number % batch_size == 0 and header_count > 0) or flush:
+        logger.debug("Bulk creating records.")
         try:
             num_created = 0
             num_expected = 0
             for model, records in unsaved_records.items():
                 num_expected += len(records)
                 num_created += len(model.objects.bulk_create(records))
+            if num_created != num_expected:
+                logger.error(f"Bulk create only created {num_created}/{num_expected}!")
+            else:
+                logger.info(f"Created {num_created}/{num_expected} records.")
             return num_created == num_expected, {}
         except DatabaseError as e:
             logger.error(f"Encountered error while creating datafile records: {e}")
@@ -70,7 +79,10 @@ def bulk_create_records(unsaved_records, line_number, header_count, batch_size=1
 def bulk_create_errors(unsaved_parser_errors, num_errors, batch_size=5000, flush=False):
     """Bulk create all ParserErrors."""
     if flush or (unsaved_parser_errors and num_errors >= batch_size):
-        ParserError.objects.bulk_create(list(itertools.chain.from_iterable(unsaved_parser_errors.values())))
+        logger.debug("Bulk creating ParserErrors.")
+        num_created = len(ParserError.objects.bulk_create(list(itertools.chain.from_iterable(
+            unsaved_parser_errors.values()))))
+        logger.info(f"Created {num_created}/{num_errors} ParserErrors.")
         return {}, 0
     return unsaved_parser_errors, num_errors
 
@@ -94,12 +106,16 @@ def evaluate_trailer(datafile, trailer_count, multiple_trailer_errors, is_last_l
 
 def rollback_records(unsaved_records, datafile):
     """Delete created records in the event of a failure."""
+    logger.info("Rolling back created records.")
     for model in unsaved_records:
-        model.objects.filter(datafile=datafile).delete()
+        num_deleted, models = model.objects.filter(datafile=datafile).delete()
+        logger.debug(f"Deleted {num_deleted} records of type: {model}.")
 
 def rollback_parser_errors(datafile):
     """Delete created errors in the event of a failure."""
-    ParserError.objects.filter(file=datafile).delete()
+    logger.info("Rolling back created parser errors.")
+    num_deleted, models = ParserError.objects.filter(file=datafile).delete()
+    logger.debug(f"Deleted {num_deleted} {ParserError}.")
 
 def parse_datafile_lines(datafile, program_type, section, is_encrypted):
     """Parse lines with appropriate schema and return errors."""
@@ -135,6 +151,8 @@ def parse_datafile_lines(datafile, program_type, section, is_encrypted):
                                                                           is_last, line, line_number)
 
         if trailer_errors is not None:
+            logger.debug(f"{len(trailer_errors)} trailer error(s) detected for file " +
+                         f"'{datafile.original_filename}' on line {line_number}.")
             errors['trailer'] = trailer_errors
             unsaved_parser_errors.update({"trailer": trailer_errors})
             num_errors += len(trailer_errors)
@@ -142,6 +160,7 @@
         generate_error = util.make_generate_parser_error(datafile, line_number)
 
         if header_count > 1:
+            logger.info(f"Preparser Error -> Multiple headers found for file: {datafile.id} on line: {line_number}.")
             errors.update({'document': ['Multiple headers found.']})
             err_obj = generate_error(
                 schema=None,
@@ -173,6 +192,9 @@
             record_number += 1
             record, record_is_valid, record_errors = r
             if not record_is_valid:
+                logger.debug(f"Record #{i} from line {line_number} is invalid.")
+                line_errors = errors.get(f"{line_number}_{i}", {})
+                line_errors.update({record_number: record_errors})
                 errors.update({f"{line_number}_{i}": record_errors})
                 unsaved_parser_errors.update({f"{line_number}_{i}": record_errors})
                 num_errors += len(record_errors)
@@ -185,6 +207,7 @@
         unsaved_parser_errors, num_errors = bulk_create_errors(unsaved_parser_errors, num_errors)
 
     if header_count == 0:
+        logger.info(f"Preparser Error -> No headers found for file: {datafile.id}.")
         errors.update({'document': ['No headers found.']})
         err_obj = generate_error(
             schema=None,
@@ -203,6 +226,7 @@
     # successfully create the records.
     all_created, unsaved_records = bulk_create_records(unsaved_records, line_number, header_count, flush=True)
     if not all_created:
+        logger.error(f"Not all parsed records created for file: {datafile.id}!")
         rollback_records(unsaved_records, datafile)
         bulk_create_errors(unsaved_parser_errors, num_errors, flush=True)
         return errors
@@ -218,6 +242,7 @@ def manager_parse_line(line, schema_manager, generate_error):
         records = schema_manager.parse_and_validate(line, generate_error)
         return records
 
+    logger.debug("Record Type is missing from record.")
     return [(None, False, [
         generate_error(
             schema=None,
diff --git a/tdrs-backend/tdpservice/parsers/row_schema.py b/tdrs-backend/tdpservice/parsers/row_schema.py
index a15b1a6bc..a4faecdf3 100644
--- a/tdrs-backend/tdpservice/parsers/row_schema.py
+++ b/tdrs-backend/tdpservice/parsers/row_schema.py
@@ -1,6 +1,9 @@
 """Row schema for datafile."""
 from .models import ParserErrorCategoryChoices
 from .fields import Field, value_is_empty
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class RowSchema:
@@ -45,6 +48,7 @@ def parse_and_validate(self, line, generate_error):
         if not preparsing_is_valid:
             if self.quiet_preparser_errors:
                 return None, True, []
+            logger.info(f"{len(preparsing_errors)} preparser error(s) encountered.")
             return None, False, preparsing_errors
 
         # parse line to model
diff --git a/tdrs-backend/tdpservice/scheduling/parser_task.py b/tdrs-backend/tdpservice/scheduling/parser_task.py
index 649ae0723..4ffd91277 100644
--- a/tdrs-backend/tdpservice/scheduling/parser_task.py
+++ b/tdrs-backend/tdpservice/scheduling/parser_task.py
@@ -16,6 +16,6 @@ def parse(data_file_id):
     # for undetermined amount of time.
     data_file = DataFile.objects.get(id=data_file_id)
 
-    logger.info(f"DataFile parsing started for file {data_file.filename}")
+    logger.info(f"DataFile parsing started for file -> {repr(data_file)}")
     errors = parse_datafile(data_file)
-    logger.info(f"DataFile parsing finished with {len(errors)} errors: {errors}")
+    logger.info(f"DataFile parsing finished with {len(errors)} errors, for file -> {repr(data_file)}.")
diff --git a/tdrs-backend/tdpservice/settings/common.py b/tdrs-backend/tdpservice/settings/common.py
index cae90db27..e69a0b8c8 100644
--- a/tdrs-backend/tdpservice/settings/common.py
+++ b/tdrs-backend/tdpservice/settings/common.py
@@ -205,10 +205,12 @@ class Common(Configuration):
             },
             "verbose": {
                 "format": (
-                    "[%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s"
+                    "%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s"
                 )
             },
             "simple": {"format": "%(levelname)s %(message)s"},
+            "color": {"()": "tdpservice.core.logger.ColorFormatter",
+                      "format": "%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s"}
         },
         "filters": {"require_debug_true": {"()": "django.utils.log.RequireDebugTrue"}},
         "handlers": {
@@ -224,7 +226,7 @@
             },
             "application": {
                 "class": "logging.StreamHandler",
-                "formatter": "verbose",
+                "formatter": "color",
             },
         },
         "loggers": {
@@ -233,6 +235,11 @@
                 "propagate": True,
                 "level": LOGGING_LEVEL
             },
+            "tdpservice.parsers": {
+                "handlers": ["application"],
+                "propagate": False,
+                "level": LOGGING_LEVEL
+            },
             "django": {"handlers": ["console"], "propagate": True},
             "django.server": {
                 "handlers": ["django.server"],
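
Illustrative usage sketch (not part of the patch above): a minimal, standalone example of how the new ColorFormatter is expected to behave when attached to a plain StreamHandler, using the same format string as the "color" formatter added to settings/common.py. The logger name "tdpservice.parsers.demo" is hypothetical and chosen only for this sketch.

import logging

from tdpservice.core.logger import ColorFormatter

handler = logging.StreamHandler()
handler.setFormatter(ColorFormatter("%(asctime)s %(levelname)s %(filename)s::%(funcName)s:L%(lineno)d : %(message)s"))

# Hypothetical logger name used only for this sketch.
demo_logger = logging.getLogger("tdpservice.parsers.demo")
demo_logger.addHandler(handler)
demo_logger.setLevel(logging.DEBUG)

# ColorFormatter.format() wraps each record in the ANSI code mapped to its level and resets the color afterwards.
demo_logger.debug("rendered in cyan")
demo_logger.info("rendered in green")
demo_logger.warning("rendered in yellow")
demo_logger.error("rendered in light red")
demo_logger.critical("rendered in red")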