diff --git a/app.py b/app.py index 5251e6b..ac00662 100644 --- a/app.py +++ b/app.py @@ -1,30 +1,43 @@ -""" Module for Table Reader main function and argparse """ +""" +Table Reader - A command-line tool for processing tables from images and +outputting data frames. This script defines the CLI interface, processes images, +and applies OCR with custom options like whitelist, blacklist, and character corrections. +""" import argparse import os import logging import cv2 -from cli.read_and_process import convert_to_grayscale, \ - normalize_images, perform_ocr, process_text, pad_columns, \ - remove_special_characters, create_dataframe, save_dataframe_to_directory - +from cli.read_and_process import ( + convert_to_grayscale, + normalize_images, + perform_ocr, + process_text, + pad_columns, + remove_special_characters, + create_dataframe, + save_dataframe_to_directory, + correct_common_ocr_mistakes +) import config def setup_logging(): """ - Set up logging configuration. + Set up logging configuration to output logs to the console. """ logging.basicConfig( - level=logging.INFO, # Change to DEBUG for more detailed output + level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler() # Log to console - ] + handlers=[logging.StreamHandler()] ) def parse_arguments(): """ - Parse command-line arguments for the Table Reader application. + Parse command-line arguments for Table Reader, providing options + for input/output directories, whitelist, blacklist, and character corrections. + + Returns: + argparse.Namespace: Parsed command-line arguments. """ parser = argparse.ArgumentParser( description=( @@ -34,40 +47,64 @@ def parse_arguments(): formatter_class=argparse.ArgumentDefaultsHelpFormatter ) + # Input and output directories parser.add_argument( "-i", "--input-dir", help="Path to the input directory containing images to process.", - default=config.DEFAULT_INPUT_DIR, - required=True + default=None, + required=False ) parser.add_argument( "-o", "--output-dir", help="Path to the output directory where processed data frames will be saved as CSV files.", - default=config.DEFAULT_OUTPUT_DIR, - required=True + default=None, + required=False ) - # New arguments + # Whitelist and blacklist arguments parser.add_argument( "-w", "--whitelist", - help="Characters to whitelist (preserve) in OCR output.", - default="", # No whitelist by default + help=( + 'Restrict the OCR output to only include certain characters. ' + 'For example, to only allow numeric characters and common punctuation: ' + '"0123456789.,".' + ), + default="", ) parser.add_argument( "-b", "--blacklist", - help="Characters to remove from OCR output.", - default="", # No blacklist by default + help=( + 'Remove specific characters from the OCR output. For example, to remove ' + 'common punctuation marks like commas and periods: ",."' + ), + default="", ) + # Custom character corrections argument parser.add_argument( "-c", "--char-corrections", - help="Custom character corrections for misread letters, numbers, or combinations of - e.g., 'S1:51, S2:52'.", - default="", # No custom corrections by default + help="Custom character corrections in the format: wrong:correct, e.g., 'S1:51,S2:52'.", + default="", + ) + + # Argument to show default corrections + parser.add_argument( + "-d", "--show-default-corrections", + action="store_true", + help="Show the default character corrections used by Table Reader." ) args = parser.parse_args() + + # Conditionally require input-dir and output-dir unless showing default corrections + if not args.show_default_corrections: + if args.input_dir is None: + parser.error("--input-dir is required unless --show-default-corrections is specified.") + if args.output_dir is None: + parser.error("--output-dir is required unless --show-default-corrections is specified.") + return args def parse_char_corrections(correction_string): @@ -89,25 +126,47 @@ def parse_char_corrections(correction_string): corrections[key] = value return corrections -def main(): +def show_default_corrections(): """ - Process multiple images and save output data frames to CSV files. - - Parses command-line arguments to set input and output directories. - Reads images from the input directory, processes each image, and - saves the extracted data to separate CSV files in the output directory. - - Args: - None (Uses command-line arguments for input and output directories) - - Returns: - None + Display the default corrections used by the Table Reader. """ + default_corrections = { + ',': ',', # Ensure decimal comma is preserved + '--': 'โ€”', # Convert double hyphen to em dash + '-': '-', # Ensure standard hyphen is preserved + '1o': '10', '2o': '20', '3o': '30', '4o': '40', '5o': '50', + '6o': '60', '7o': '70', '8o': '80', '9o': '90', '0o': '00', + 'S1': '51', 'S2': '52', 'S3': '53', 'S4': '54', 'S5': '55', + 'S6': '56', 'S7': '57', 'S8': '58', 'S9': '59', 'S0': '50', + 'ยท': '.', # Ensure mid-dot conversion to decimal + 'โ€“': 'โ€”', # Ensure en dash is preserved or converted to em dash + } + + print("By default, Table Reader includes these OCR Corrections:") + for wrong, correct in default_corrections.items(): + print(f" '{wrong}' -> '{correct}'") + print( + "You can provide your own custom character corrections to Table Reader " + "using the --char-corrections flag in the format: wrong:correct." + ) +def main(): + """ + Main function to process images and output data frames. + It handles image preprocessing, OCR, and saving the final CSV output. + """ setup_logging() args = parse_arguments() - # Set input and output directories in config module + # If the user requests to show default corrections, display them and exit + if args.show_default_corrections: + show_default_corrections() + return + + # Parse custom character corrections + custom_corrections = parse_char_corrections(args.char_corrections) + + # Set input and output directories in the config module config.set_input_directory(args.input_dir) config.set_output_directory(args.output_dir) @@ -116,8 +175,11 @@ def main(): logging.error("Input directory %s does not exist.", args.input_dir) return - # Read all images from the input directory - image_files = [f for f in os.listdir(args.input_dir) if f.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'))] + # Read all valid images from the input directory + image_files = [ + f for f in os.listdir(args.input_dir) + if f.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')) + ] if not image_files: logging.warning("No valid image files found in the input directory: %s", args.input_dir) @@ -129,7 +191,7 @@ def main(): logging.info("Processing image: %s", image_path) try: - # Read the image directly using OpenCV + # Read the image using OpenCV image = cv2.imread(image_path) if image is None: logging.warning("Unable to read image: %s", image_path) @@ -139,9 +201,16 @@ def main(): grayscale_image = convert_to_grayscale([image])[0] normalized_image = normalize_images([grayscale_image])[0] extracted_text = perform_ocr([normalized_image])[0] - processed_text = process_text([extracted_text]) + + # Apply user-defined whitelist, blacklist, and corrections + corrected_text = correct_common_ocr_mistakes(extracted_text, custom_corrections) + processed_text = process_text([corrected_text]) padded_columns = pad_columns(processed_text) - clean_data = remove_special_characters(padded_columns) + clean_data = remove_special_characters( + padded_columns, + whitelist=args.whitelist, + blacklist=args.blacklist + ) df = create_dataframe(clean_data) # Save DataFrame to output directory with a unique filename