Skip to content

Commit

Permalink
update --help descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
JasonGellis committed Sep 6, 2024
1 parent 7274779 commit 6942584
Showing 1 changed file with 109 additions and 40 deletions.
149 changes: 109 additions & 40 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,43 @@
""" Module for Table Reader main function and argparse """
"""
Table Reader - A command-line tool for processing tables from images and
outputting data frames. This script defines the CLI interface, processes images,
and applies OCR with custom options like whitelist, blacklist, and character corrections.
"""

import argparse
import os
import logging
import cv2
from cli.read_and_process import convert_to_grayscale, \
normalize_images, perform_ocr, process_text, pad_columns, \
remove_special_characters, create_dataframe, save_dataframe_to_directory

from cli.read_and_process import (
convert_to_grayscale,
normalize_images,
perform_ocr,
process_text,
pad_columns,
remove_special_characters,
create_dataframe,
save_dataframe_to_directory,
correct_common_ocr_mistakes
)
import config

def setup_logging():
"""
Set up logging configuration.
Set up logging configuration to output logs to the console.
"""
logging.basicConfig(
level=logging.INFO, # Change to DEBUG for more detailed output
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler() # Log to console
]
handlers=[logging.StreamHandler()]
)

def parse_arguments():
"""
Parse command-line arguments for the Table Reader application.
Parse command-line arguments for Table Reader, providing options
for input/output directories, whitelist, blacklist, and character corrections.
Returns:
argparse.Namespace: Parsed command-line arguments.
"""
parser = argparse.ArgumentParser(
description=(
Expand All @@ -34,40 +47,64 @@ def parse_arguments():
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

# Input and output directories
parser.add_argument(
"-i", "--input-dir",
help="Path to the input directory containing images to process.",
default=config.DEFAULT_INPUT_DIR,
required=True
default=None,
required=False
)

parser.add_argument(
"-o", "--output-dir",
help="Path to the output directory where processed data frames will be saved as CSV files.",
default=config.DEFAULT_OUTPUT_DIR,
required=True
default=None,
required=False
)

# New arguments
# Whitelist and blacklist arguments
parser.add_argument(
"-w", "--whitelist",
help="Characters to whitelist (preserve) in OCR output.",
default="", # No whitelist by default
help=(
'Restrict the OCR output to only include certain characters. '
'For example, to only allow numeric characters and common punctuation: '
'"0123456789.,".'
),
default="",
)

parser.add_argument(
"-b", "--blacklist",
help="Characters to remove from OCR output.",
default="", # No blacklist by default
help=(
'Remove specific characters from the OCR output. For example, to remove '
'common punctuation marks like commas and periods: ",."'
),
default="",
)

# Custom character corrections argument
parser.add_argument(
"-c", "--char-corrections",
help="Custom character corrections for misread letters, numbers, or combinations of - e.g., 'S1:51, S2:52'.",
default="", # No custom corrections by default
help="Custom character corrections in the format: wrong:correct, e.g., 'S1:51,S2:52'.",
default="",
)

# Argument to show default corrections
parser.add_argument(
"-d", "--show-default-corrections",
action="store_true",
help="Show the default character corrections used by Table Reader."
)

args = parser.parse_args()

# Require --input-dir and --output-dir unless --show-default-corrections is given
if not args.show_default_corrections:
if args.input_dir is None:
parser.error("--input-dir is required unless --show-default-corrections is specified.")
if args.output_dir is None:
parser.error("--output-dir is required unless --show-default-corrections is specified.")

return args

def parse_char_corrections(correction_string):
Expand All @@ -89,25 +126,47 @@ def parse_char_corrections(correction_string):
corrections[key] = value
return corrections

def main():
def show_default_corrections():
"""
Process multiple images and save output data frames to CSV files.
Parses command-line arguments to set input and output directories.
Reads images from the input directory, processes each image, and
saves the extracted data to separate CSV files in the output directory.
Args:
None (Uses command-line arguments for input and output directories)
Returns:
None
Display the default corrections used by the Table Reader.
"""
default_corrections = {
',': ',', # Ensure decimal comma is preserved
'--': '—', # Convert double hyphen to em dash
'-': '-', # Ensure standard hyphen is preserved
'1o': '10', '2o': '20', '3o': '30', '4o': '40', '5o': '50',
'6o': '60', '7o': '70', '8o': '80', '9o': '90', '0o': '00',
'S1': '51', 'S2': '52', 'S3': '53', 'S4': '54', 'S5': '55',
'S6': '56', 'S7': '57', 'S8': '58', 'S9': '59', 'S0': '50',
'·': '.', # Ensure mid-dot conversion to decimal
'–': '—', # Convert en dash to em dash
}

print("By default, Table Reader includes these OCR Corrections:")
for wrong, correct in default_corrections.items():
print(f" '{wrong}' -> '{correct}'")
print(
"You can provide your own custom character corrections to Table Reader "
"using the --char-corrections flag in the format: wrong:correct."
)

def main():
"""
Main function to process images and output data frames.
It handles image preprocessing, OCR, and saving the final CSV output.
"""
setup_logging()
args = parse_arguments()

# Set input and output directories in config module
# If the user requests to show default corrections, display them and exit
if args.show_default_corrections:
show_default_corrections()
return

# Parse custom character corrections
custom_corrections = parse_char_corrections(args.char_corrections)

# Set input and output directories in the config module
config.set_input_directory(args.input_dir)
config.set_output_directory(args.output_dir)

Expand All @@ -116,8 +175,11 @@ def main():
logging.error("Input directory %s does not exist.", args.input_dir)
return

# Read all images from the input directory
image_files = [f for f in os.listdir(args.input_dir) if f.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'))]
# Read all valid images from the input directory
image_files = [
f for f in os.listdir(args.input_dir)
if f.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'))
]

if not image_files:
logging.warning("No valid image files found in the input directory: %s", args.input_dir)
Expand All @@ -129,7 +191,7 @@ def main():
logging.info("Processing image: %s", image_path)

try:
# Read the image directly using OpenCV
# Read the image using OpenCV
image = cv2.imread(image_path)
if image is None:
logging.warning("Unable to read image: %s", image_path)
Expand All @@ -139,9 +201,16 @@ def main():
grayscale_image = convert_to_grayscale([image])[0]
normalized_image = normalize_images([grayscale_image])[0]
extracted_text = perform_ocr([normalized_image])[0]
processed_text = process_text([extracted_text])

# Apply character corrections to the raw OCR text; the whitelist and blacklist are applied below in remove_special_characters
corrected_text = correct_common_ocr_mistakes(extracted_text, custom_corrections)
processed_text = process_text([corrected_text])
padded_columns = pad_columns(processed_text)
clean_data = remove_special_characters(padded_columns)
clean_data = remove_special_characters(
padded_columns,
whitelist=args.whitelist,
blacklist=args.blacklist
)
df = create_dataframe(clean_data)

# Save DataFrame to output directory with a unique filename
Expand Down

0 comments on commit 6942584

Please sign in to comment.