main.py
from modules.racist_text_query import racist_text_query
from modules.bigotry_dict import bigotry_dict
from modules.OCR import tiff_to_ocr
from modules.racist_chatgpt_analysis import racist_chatgpt_analysis
from modules.locate import locate
from modules.pagenum import crop_image
import os
import pandas as pd
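
# NOTE: the helper modules imported above are not shown on this page; the
# interfaces sketched below are inferred from how main.py uses them and may
# differ from the actual implementations:
#   tiff_to_ocr(image_path)               -> OCR text of a TIFF page as a string
#   racist_chatgpt_analysis(text)         -> truthy when the text is flagged
#   racist_text_query(text, bigotry_dict) -> truthy when a dictionary term matches
#   locate(text)                          -> (probable page numbers, dates, book numbers)
#   crop_image(src_path, dst_path)        -> saves the cropped page-number region
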
def racism_threshold(file_dir):
    # Create the new folder for cropped images
    cropped_images_dir = os.path.join(file_dir, 'deed page number')
    if not os.path.exists(cropped_images_dir):
        os.makedirs(cropped_images_dir)

    data = []
    for images in os.listdir(file_dir):
        if images.endswith(".tif") or images.endswith(".tiff"):
            image_path = os.path.join(file_dir, images)
            # Run OCR on the image
            text = tiff_to_ocr(image_path)
            result1 = racist_chatgpt_analysis(text)
            result2 = racist_text_query(text, bigotry_dict)
            a, b, c = locate(text)

            # Define the output path for the cropped image in the new folder
            cropped_image_name = "cropped_" + images
            cropped_image_path = os.path.join(cropped_images_dir, cropped_image_name)

            # Crop the image and save it to the new folder
            crop_image(image_path, cropped_image_path)
            image_path_formatted = cropped_image_path  # .replace(' ', '%20')
            hyperlink_formula = f'file://{image_path_formatted}'

            # Fail-safe page number detection
            page = tiff_to_ocr(cropped_image_path)
            fail_safe_page = []
            result = page.split("\n")
            for word in result:
                # Checks for possible page numbers
                if word.isdigit():
                    fail_safe_page.append(word)

            if result1 or result2:
                print(images, a, b, c)
                if len(fail_safe_page) != 0:
                    a.append(fail_safe_page)
                data.append([images, a, b[0], c[0], hyperlink_formula])
            else:
                print(images + " : Not Racist")
                # data.append([images, a, b[0], c[0], hyperlink_formula])

    # Include the hyperlink in the DataFrame columns
    df = pd.DataFrame(data, columns=['File Name', 'Probable Page Number', 'Date', 'Book Number', 'Page Link'])
    df.index += 1
    df.to_csv(os.path.join(file_dir, 'Racist Deeds.csv'), index=True)
    df.to_excel(os.path.join(file_dir, 'Racist Deeds.xlsx'), index=True)
racism_threshold('folderpath')
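

# A minimal sketch of the kind of dictionary lookup racist_text_query could
# perform -- this is an assumption for illustration only; the real module in
# modules/racist_text_query.py is not shown on this page and may differ. It
# assumes bigotry_dict is an iterable of string terms.
def _example_text_query(text, term_dict):
    """Return the dictionary terms found in the OCR text (hypothetical helper)."""
    lowered = text.lower()
    return [term for term in term_dict if term.lower() in lowered]

# Usage (hypothetical): _example_text_query(ocr_text, bigotry_dict) returns the
# matched terms, so a non-empty list plays the role of a truthy result2 above.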