From 463c74dbe0f20145ff974420b9ffef7b39ed4ee1 Mon Sep 17 00:00:00 2001
From: JasonGellis <jasonjgellis@gmail.com>
Date: Fri, 6 Sep 2024 12:26:15 +0100
Subject: [PATCH] Update functions to handle whitelist and blacklist

---
 cli/read_and_process.py | 149 ++++++++++++++--------------------------
 1 file changed, 53 insertions(+), 96 deletions(-)

diff --git a/cli/read_and_process.py b/cli/read_and_process.py
index 5f6427c..cd87a16 100644
--- a/cli/read_and_process.py
+++ b/cli/read_and_process.py
@@ -1,11 +1,11 @@
-""" functions for importing and processing images. """
-import re
+"""
+Functions for importing and processing images.
+"""
 import os
 import cv2
 import pandas as pd
 import pytesseract
 
-
 def read_images(input_dir):
     """
     Import images from a directory.
@@ -17,38 +17,37 @@ def read_images(input_dir):
         images (list): List of imported images.
     """
     images = []
-    # Iterate over files in the directory
     for filename in os.listdir(input_dir):
         # Check if the file is an image
         if filename.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif')):
-            # Read the image using OpenCV
             image_path = os.path.join(input_dir, filename)
             image = cv2.imread(image_path)
             if image is not None:  # Check if the image was successfully loaded
                 images.append(image)
-            else:  # Raise an error if the image failed to load
+            else:
                 raise ValueError(f"Failed to load image: {image_path}")
     return images
 
 
 def convert_to_grayscale(images):
     """
-    Convert imported image or list of imported images to grayscale.
+    Convert imported images to grayscale.
 
     Parameters:
-        images (numpy.ndarray or list): Input image or list of input images.
+        images (list): List of input images (only list input is handled).
 
     Returns:
         grayscale_images (list): List of grayscale images.
     """
-    if isinstance(images, list):  # Check if images is a list
-        grayscale_images = []  # Initialize an empty list to store grayscale images
+    if isinstance(images, list):
+        grayscale_images = []
         for image in images:
-            grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert image to grayscale
+            grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
             grayscale_images.append(grayscale_image)
 
     return grayscale_images
 
+
 def normalize_images(grayscale_images):
     """
     Normalize a list of grayscale images to stretch contrast.
@@ -59,26 +58,16 @@ def normalize_images(grayscale_images):
     Returns:
         normalized_images (list): List of normalized images.
     """
-    # Initialize an empty list to store normalized images
     normalized_images = []
-
-    # Iterate over each grayscale image in the input list
     for grayscale_image in grayscale_images:
-        # Calculate the minimum and maximum intensity values of the image
         min_intensity = min(grayscale_image.ravel())
         max_intensity = max(grayscale_image.ravel())
-
-        # Normalize the image to stretch contrast
-        normalized_image = cv2.normalize(grayscale_image,
-                                         None,
-                                         min_intensity,
-                                         max_intensity,
-                                         cv2.NORM_MINMAX)
-
+        normalized_image = cv2.normalize(grayscale_image, None, min_intensity, max_intensity, cv2.NORM_MINMAX)
         normalized_images.append(normalized_image)
 
     return normalized_images
 
+
 def perform_ocr(normalized_images):
     """
     Perform OCR (optical character recognition) on a list of images with enhanced
@@ -91,59 +80,51 @@ def perform_ocr(normalized_images):
         extracted_text (list): List of extracted text from each image.
     """
     extracted_text = []
-
-    # Custom Tesseract configuration without strict whitelisting
     custom_config = (
-        r'--oem 3 --psm 6 '  # Use LSTM OCR Engine and assume a block of text (table)
-        r'-c preserve_interword_spaces=1'  # Preserve spacing between words (useful for tables)
+        r'--oem 3 --psm 6 '  # Default OCR engine mode; assume a single uniform block of text
+        r'-c preserve_interword_spaces=1'  # Preserve spacing between words
     )
 
     for normal_image in normalized_images:
-        # Perform OCR with custom configuration
         text = pytesseract.image_to_string(normal_image, config=custom_config, lang='eng')
-
-        # Post-process text to handle common OCR issues
         corrected_text = correct_common_ocr_mistakes(text)
         extracted_text.append(corrected_text)
 
     return extracted_text
 
-def correct_common_ocr_mistakes(text):
+
+def correct_common_ocr_mistakes(text, custom_corrections=None):
     """
     Correct common OCR mistakes related to special characters, superscripts, and subscripts.
 
     Parameters:
         text (str): Text output from OCR.
+        custom_corrections (dict): User-specified corrections for specific character combinations.
 
     Returns:
         corrected_text (str): Text with common OCR mistakes corrected.
     """
-    # List of characters that might be misrecognized as mid-dots or similar
-    mid_dot_variants = ['·', '•', 'o', '°', '˙']  # Potential misrecognitions of mid-dot
-
-    for variant in mid_dot_variants:
-        text = text.replace(variant, '.')
-
-    # Dictionary to correct OCR misinterpretations
     corrections = {
         ',': ',',  # Ensure decimal comma is preserved
-        '--': '—',  # Convert double hyphen to em dash (if OCR mistakes are found)
+        '--': '—',  # Convert double hyphen to em dash
         '-': '-',  # Ensure standard hyphen is preserved
-        # Example: OCR misinterpreting '10' as '1o'
-        '1o': '10', '2o': '20', '3o': '30','4o': '40','5o': '50',
-        '6o': '60', '7o': '70', '8o': '80','9o': '90','0o': '00',
-       # Example correction for letters that look like numbers
-        'S1': '51', 'S2': '52','S3': '53','S4': '54','S5': '55',
-        'S6': '56','S7': '57','S8': '58','S9': '59','S0': '50',
-        '·': '.',  # Ensure mid-dot conversion to decimal
-        '–': '—',  # Ensure en dash is preserved (or converted to em dash if needed)
+        '1o': '10', '2o': '20', '3o': '30', '4o': '40', '5o': '50',
+        '6o': '60', '7o': '70', '8o': '80', '9o': '90', '0o': '00',
+        'S1': '51', 'S2': '52', 'S3': '53', 'S4': '54', 'S5': '55',
+        'S6': '56', 'S7': '57', 'S8': '58', 'S9': '59', 'S0': '50',
+        '·': '.',  # Convert mid-dot to decimal point
+        '–': '—',  # Convert en dash to em dash
     }
 
+    if custom_corrections:
+        corrections.update(custom_corrections)
+
     for wrong, correct in corrections.items():
         text = text.replace(wrong, correct)
 
     return text
 
+
 def process_text(extracted_text):
     """
     Create a data structure using extracted text from images.
@@ -154,90 +135,70 @@ def process_text(extracted_text):
     Returns:
         data (list): Processed data structure.
     """
-    # Initialize an empty list to store processed data
     processed_data = []
-
-    # Iterate over each extracted text
     for text in extracted_text:
-        # Split the text into lines
         lines = text.strip().split('\n')
-
-        # Split each line into words
         words = [line.split() for line in lines]
-
-        # Append the processed data to the main list
         processed_data.append(words)
 
     return processed_data
 
+
 def pad_columns(processed_data):
     """
     Pad rows with fewer columns to match the maximum number of columns in the data.
 
     Parameters:
-        data (list): Processed data structure.
+        processed_data (list): Processed data structure.
 
     Returns:
         padded_data (list): Data structure with padded rows.
     """
-    # Find the maximum number of columns in the data
     max_columns = max(len(row) for row in processed_data)
-
-    # Pad rows with fewer columns
     padded_data = [row + [''] * (max_columns - len(row)) for row in processed_data]
 
     return padded_data
 
-def remove_special_characters(data, exceptions=None):
+
+def remove_special_characters(data, whitelist=None, blacklist=None):
     """
-    Remove specified characters from a dataset, excluding decimal values and English letters.
+    Remove or retain specified characters from a dataset based on the whitelist and blacklist.
 
     Parameters:
-        data (list or DataFrame): Dataset to process.
-        exceptions (list): List of characters to exempt from removal (default is ['.', ',']).
+        data (list): Processed data structure (list of lists).
+        whitelist (str): A string of characters to retain.
+        blacklist (str): A string of characters to remove.
 
     Returns:
-        cleaned_data (list or DataFrame): Dataset with specified characters removed.
+        cleaned_data (list): Data structure with specified characters removed or retained.
     """
-    if exceptions is None:
-        exceptions = []  # Initialize default exceptions list inside the function
-
-    if isinstance(data, list):
-        # If data is a list, process each element
-        cleaned_data = [remove_special_characters(element, exceptions) for element in data]
-    elif hasattr(data, 'applymap'):
-        # If data is a DataFrame, process each cell
-        cleaned_data = data.applymap(lambda x: remove_special_characters(x, exceptions))
-    elif isinstance(data, str):
-        # If data is a string, remove specified characters
-        exceptions_regex = ''.join([re.escape(char) for char in exceptions])
-        pattern = f'[^0-9{exceptions_regex}a-zA-Z]'
-        cleaned_data = re.sub(pattern, '', data)
-    else:
-        # For other data types, return as is
-        cleaned_data = data
+    def process_string(text):
+        if whitelist:
+            text = ''.join([char for char in text if char in whitelist])
+        if blacklist:
+            text = ''.join([char for char in text if char not in blacklist])
+        return text
+
+    cleaned_data = []
+    for row in data:
+        cleaned_row = [process_string(element) for element in row]
+        cleaned_data.append(cleaned_row)
 
     return cleaned_data
 
 
 def create_dataframe(cleaned_data):
     """
-    Create a DataFrame from the padded data structure.
+    Create a DataFrame from the cleaned and padded data structure.
 
     Parameters:
-        padded_data (list): Data structure with padded rows.
+        cleaned_data (list): Data structure with cleaned and padded rows.
 
     Returns:
-        df (DataFrame): DataFrame created from the padded data structure.
+        pd.DataFrame: DataFrame created from the data structure.
     """
-
-    # Flatten each sublist within padded_data and create a flat list
     flattened_data = [item for sublist in cleaned_data for item in sublist]
-
-    # Create DataFrame from the flattened data
     df = pd.DataFrame(flattened_data, dtype='object')
-
-    # Ensure that each value is in its own cell
     df = df.map(lambda x: x[0] if isinstance(x, list) else x)
     return df
 
@@ -247,16 +208,12 @@ def save_dataframe_to_directory(dataframe, output_dir, file_name):
     Save a DataFrame to a file in a specified directory.
 
     Parameters:
-        dataframe (pandas.DataFrame): The DataFrame to be saved.
-        directory (str): Path to the directory where the DataFrame will be saved.
+        dataframe (pd.DataFrame): The DataFrame to be saved.
+        output_dir (str): Path to the directory where the DataFrame will be saved.
         file_name (str): Name of the file (including extension) to save the DataFrame.
     """
-    # Create the directory if it doesn't exist
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
-    # Construct the file path
     file_path = os.path.join(output_dir, file_name)
-
-    # Save DataFrame to file
     dataframe.to_csv(file_path, index=False)