main.py
from modules.racist_text_query import racist_text_query
from modules.bigotry_dict import bigotry_dict
from modules.OCR import tiff_to_ocr
from modules.racist_chatgpt_analysis import racist_chatgpt_analysis
from modules.locate import locate
from modules.pagenum import crop_image
import os
import pandas as pd
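
# NOTE: the helper modules imported above are not shown on this page; the
# interfaces sketched below are inferred from how main.py uses them and may
# differ from the actual implementations:
#   tiff_to_ocr(image_path)               -> OCR text of a TIFF page as a string
#   racist_chatgpt_analysis(text)         -> truthy when the text is flagged
#   racist_text_query(text, bigotry_dict) -> truthy when a dictionary term matches
#   locate(text)                          -> (probable page numbers, dates, book numbers)
#   crop_image(src_path, dst_path)        -> saves the cropped page-number region
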
def racism_threshold(file_dir):
    # Create the new folder for cropped images
    cropped_images_dir = os.path.join(file_dir, 'deed page number')
    if not os.path.exists(cropped_images_dir):
        os.makedirs(cropped_images_dir)

    data = []
    for images in os.listdir(file_dir):
        if images.endswith(".tif") or images.endswith(".tiff"):
            image_path = os.path.join(file_dir, images)
            # Run OCR on the image
            text = tiff_to_ocr(image_path)
            result1 = racist_chatgpt_analysis(text)
            result2 = racist_text_query(text, bigotry_dict)
            a, b, c = locate(text)

            # Define the output path for the cropped image in the new folder
            cropped_image_name = "cropped_" + images
            cropped_image_path = os.path.join(cropped_images_dir, cropped_image_name)

            # Crop the image and save it to the new folder
            crop_image(image_path, cropped_image_path)
            image_path_formatted = cropped_image_path  # .replace(' ', '%20')
            hyperlink_formula = f'file://{image_path_formatted}'

            # Fail-safe page number detection
            page = tiff_to_ocr(cropped_image_path)
            fail_safe_page = []
            result = page.split("\n")
            for word in result:
                # Checks for possible page numbers
                if word.isdigit():
                    fail_safe_page.append(word)

            if result1 or result2:
                print(images, a, b, c)
                if len(fail_safe_page) != 0:
                    a.append(fail_safe_page)
                data.append([images, a, b[0], c[0], hyperlink_formula])
            else:
                print(images + " : Not Racist")
                # data.append([images, a, b[0], c[0], hyperlink_formula])

    # Include the hyperlink in the DataFrame columns
    df = pd.DataFrame(data, columns=['File Name', 'Probable Page Number', 'Date', 'Book Number', 'Page Link'])
    df.index += 1
    df.to_csv(os.path.join(file_dir, 'Racist Deeds.csv'), index=True)
    df.to_excel(os.path.join(file_dir, 'Racist Deeds.xlsx'), index=True)
racism_threshold('folderpath')
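

# A minimal sketch of the kind of dictionary lookup racist_text_query could
# perform -- this is an assumption for illustration only; the real module in
# modules/racist_text_query.py is not shown on this page and may differ. It
# assumes bigotry_dict is an iterable of string terms.
def _example_text_query(text, term_dict):
    """Return the dictionary terms found in the OCR text (hypothetical helper)."""
    lowered = text.lower()
    return [term for term in term_dict if term.lower() in lowered]

# Usage (hypothetical): _example_text_query(ocr_text, bigotry_dict) returns the
# matched terms, so a non-empty list plays the role of a truthy result2 above.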