update --help descriptions

JasonGellis · Sep 6, 2024 · 6942584 · 6942584
1 parent 7274779
commit 6942584
Showing 1 changed file with 109 additions and 40 deletions.
diff --git a/app.py b/app.py
@@ -1,30 +1,43 @@
-""" Module for Table Reader main function and argparse """
+"""
+Table Reader - A command-line tool for processing tables from images and
+outputting data frames. This script defines the CLI interface, processes images,
+and applies OCR with custom options like whitelist, blacklist, and character corrections.
+"""
 
 import argparse
 import os
 import logging
 import cv2
-from cli.read_and_process import convert_to_grayscale, \
-    normalize_images, perform_ocr, process_text, pad_columns, \
-    remove_special_characters, create_dataframe, save_dataframe_to_directory
-
+from cli.read_and_process import (
+    convert_to_grayscale,
+    normalize_images,
+    perform_ocr,
+    process_text,
+    pad_columns,
+    remove_special_characters,
+    create_dataframe,
+    save_dataframe_to_directory,
+    correct_common_ocr_mistakes
+)
 import config
 
 def setup_logging():
     """
-    Set up logging configuration.
+    Set up logging configuration to output logs to the console.
     """
     logging.basicConfig(
-        level=logging.INFO,  # Change to DEBUG for more detailed output
+        level=logging.INFO,
         format='%(asctime)s - %(levelname)s - %(message)s',
-        handlers=[
-            logging.StreamHandler()  # Log to console
-        ]
+        handlers=[logging.StreamHandler()]
     )
 
 def parse_arguments():
     """
-    Parse command-line arguments for the Table Reader application.
+    Parse command-line arguments for Table Reader, providing options
+    for input/output directories, whitelist, blacklist, and character corrections.
+
+    Returns:
+        argparse.Namespace: Parsed command-line arguments.
     """
     parser = argparse.ArgumentParser(
         description=(
@@ -34,40 +47,64 @@ def parse_arguments():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
 
+    # Input and output directories
     parser.add_argument(
         "-i", "--input-dir",
         help="Path to the input directory containing images to process.",
-        default=config.DEFAULT_INPUT_DIR,
-        required=True
+        default=None,
+        required=False
     )
 
     parser.add_argument(
         "-o", "--output-dir",
         help="Path to the output directory where processed data frames will be saved as CSV files.",
-        default=config.DEFAULT_OUTPUT_DIR,
-        required=True
+        default=None,
+        required=False
     )
 
-    # New arguments
+    # Whitelist and blacklist arguments
     parser.add_argument(
         "-w", "--whitelist",
-        help="Characters to whitelist (preserve) in OCR output.",
-        default="",  # No whitelist by default
+        help=(
+            'Restrict the OCR output to only include certain characters. '
+            'For example, to only allow numeric characters and common punctuation: '
+            '"0123456789.,".'
+        ),
+        default="",
     )
 
     parser.add_argument(
         "-b", "--blacklist",
-        help="Characters to remove from OCR output.",
-        default="",  # No blacklist by default
+        help=(
+            'Remove specific characters from the OCR output. For example, to remove '
+            'common punctuation marks like commas and periods: ",."'
+        ),
+        default="",
     )
 
+    # Custom character corrections argument
     parser.add_argument(
         "-c", "--char-corrections",
-        help="Custom character corrections for misread letters, numbers, or combinations of - e.g., 'S1:51, S2:52'.",
-        default="",  # No custom corrections by default
+        help="Custom character corrections in the format: wrong:correct, e.g., 'S1:51,S2:52'.",
+        default="",
+    )
+
+    # Argument to show default corrections
+    parser.add_argument(
+        "-d", "--show-default-corrections",
+        action="store_true",
+        help="Show the default character corrections used by Table Reader."
     )
 
     args = parser.parse_args()
+
+    # Conditionally require input-dir and output-dir unless showing default corrections
+    if not args.show_default_corrections:
+        if args.input_dir is None:
+            parser.error("--input-dir is required unless --show-default-corrections is specified.")
+        if args.output_dir is None:
+            parser.error("--output-dir is required unless --show-default-corrections is specified.")
+
     return args
 
 def parse_char_corrections(correction_string):
@@ -89,25 +126,47 @@ def parse_char_corrections(correction_string):
                 corrections[key] = value
     return corrections
 
-def main():
+def show_default_corrections():
     """
-    Process multiple images and save output data frames to CSV files.
-
-    Parses command-line arguments to set input and output directories.
-    Reads images from the input directory, processes each image, and
-    saves the extracted data to separate CSV files in the output directory.
-
-    Args:
-        None (Uses command-line arguments for input and output directories)
-
-    Returns:
-        None
+    Display the default corrections used by the Table Reader.
     """
+    default_corrections = {
+        ',': ',',  # Ensure decimal comma is preserved
+        '--': '—',  # Convert double hyphen to em dash
+        '-': '-',  # Ensure standard hyphen is preserved
+        '1o': '10', '2o': '20', '3o': '30', '4o': '40', '5o': '50',
+        '6o': '60', '7o': '70', '8o': '80', '9o': '90', '0o': '00',
+        'S1': '51', 'S2': '52', 'S3': '53', 'S4': '54', 'S5': '55',
+        'S6': '56', 'S7': '57', 'S8': '58', 'S9': '59', 'S0': '50',
+        '·': '.',  # Convert mid-dot to a period (decimal point)
+        '–': '—',  # Convert en dash to em dash
+    }
+
+    print("By default, Table Reader includes these OCR Corrections:")
+    for wrong, correct in default_corrections.items():
+        print(f"  '{wrong}' -> '{correct}'")
+    print(
+        "You can provide your own custom character corrections to Table Reader "
+        "using the --char-corrections flag in the format: wrong:correct."
+    )
 
+def main():
+    """
+    Main function to process images and output data frames.
+    It handles image preprocessing, OCR, and saving the final CSV output.
+    """
     setup_logging()
     args = parse_arguments()
 
-    # Set input and output directories in config module
+    # If the user requests to show default corrections, display them and exit
+    if args.show_default_corrections:
+        show_default_corrections()
+        return
+
+    # Parse custom character corrections
+    custom_corrections = parse_char_corrections(args.char_corrections)
+
+    # Set input and output directories in the config module
     config.set_input_directory(args.input_dir)
     config.set_output_directory(args.output_dir)
 
@@ -116,8 +175,11 @@ def main():
         logging.error("Input directory %s does not exist.", args.input_dir)
         return
 
-    # Read all images from the input directory
-    image_files = [f for f in os.listdir(args.input_dir) if f.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'))]
+    # Read all valid images from the input directory
+    image_files = [
+        f for f in os.listdir(args.input_dir)
+        if f.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'))
+    ]
 
     if not image_files:
         logging.warning("No valid image files found in the input directory: %s", args.input_dir)
@@ -129,7 +191,7 @@ def main():
         logging.info("Processing image: %s", image_path)
 
         try:
-            # Read the image directly using OpenCV
+            # Read the image using OpenCV
             image = cv2.imread(image_path)
             if image is None:
                 logging.warning("Unable to read image: %s", image_path)
@@ -139,9 +201,16 @@ def main():
             grayscale_image = convert_to_grayscale([image])[0]
             normalized_image = normalize_images([grayscale_image])[0]
             extracted_text = perform_ocr([normalized_image])[0]
-            processed_text = process_text([extracted_text])
+
+            # Apply OCR character corrections (with any user-defined ones); whitelist/blacklist are applied during cleanup below
+            corrected_text = correct_common_ocr_mistakes(extracted_text, custom_corrections)
+            processed_text = process_text([corrected_text])
             padded_columns = pad_columns(processed_text)
-            clean_data = remove_special_characters(padded_columns)
+            clean_data = remove_special_characters(
+                padded_columns,
+                whitelist=args.whitelist,
+                blacklist=args.blacklist
+            )
             df = create_dataframe(clean_data)
 
             # Save DataFrame to output directory with a unique filename