-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
58 lines (41 loc) · 2.09 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
from zipfile import ZipFile
from src.ContentExtractor import ContentExtractor
from src.PDFDataExtractor import PDFDataExtractor
from src.utils.functions import setup_output_csv, delete_directory as cleanup
from src.utils.colors import *
# Directory that is scanned for the input PDFs.
input_folder_path = './res'

# Directory that receives the generated CSV.
output_folder_path = './out'

# Full path of the result CSV, located inside the output directory.
output_file_path = '/'.join((output_folder_path, 'result.csv'))
def _process_pdf(pdf_path, intermediate):
    """Extract one PDF and append its content to the output CSV.

    The PDF is sent through ``PDFDataExtractor``, which writes its result to
    ``<intermediate>.zip``. The archive is unpacked into ``intermediate``,
    handed to ``ContentExtractor``, and the extracted content is saved to
    ``output_file_path``.

    Args:
        pdf_path: Path of the PDF file to process.
        intermediate: Path (without extension) used both for the temporary
            zip archive and for the directory it is unpacked into.

    Raises:
        Whatever the extractor classes or ``ZipFile`` raise; nothing is
        swallowed here.
    """
    archive_path = f'{intermediate}.zip'
    try:
        #Extracting JSON and table data CSVs from the PDF using Adobe ExtractPDF API
        pdf_extractor = PDFDataExtractor(pdf_path, archive_path)
        pdf_extractor.set_credentials('./pdfservices-api-credentials.json')
        pdf_extractor.initialize_operation()
        pdf_extractor.set_ExtractPDF_options()
        pdf_extractor.extract()

        #Unzipping the output from the API to an intermediate directory
        with ZipFile(archive_path, 'r') as archive:
            archive.extractall(intermediate)

        #Deleting the original zip file
        # NOTE(review): presumably removes archive_path - confirm in PDFDataExtractor.
        pdf_extractor.cleanup()

        #Extracting contents from the outputs of the API
        content_extractor = ContentExtractor(intermediate)
        content_extractor.extract()
        content_extractor.save_extracted_content(output_file_path)
    finally:
        #Deleting the intermediate directory with contents
        # Runs on failure too: extractall() does not clear an existing
        # directory, so leftovers would be mixed into the next extraction.
        # The isdir guard avoids calling cleanup on a directory that was
        # never created (failure before the unzip step).
        if os.path.isdir(intermediate):
            cleanup(intermediate)


def main():
    """Process every regular file in ``input_folder_path`` into the output CSV.

    Sets up the output CSV, then runs each file through ``_process_pdf``
    while printing a one-line progress indicator per file. Any exception
    raised while processing a file propagates and stops the run.
    """
    intermediate = './temp'

    #Setting up the output CSV
    setup_output_csv(output_file_path)

    # List the directory once and keep regular files only, so the progress
    # counter and the final summary reflect what is actually processed.
    pdf_names = [
        name for name in os.listdir(input_folder_path)
        if os.path.isfile(os.path.join(input_folder_path, name))
    ]
    num_files = len(pdf_names)

    #Iterating over files in the input directory
    # yellow / green / reset are presumably provided by src.utils.colors.
    for position, filename in enumerate(pdf_names, start=1):
        # flush=True: the line has no trailing newline, so without it a
        # line-buffered terminal would not show it until it is overwritten.
        print(f'{yellow}{position:>4}/{num_files:<4}\t {filename:<13}\t\t Processing{reset}', end='', flush=True)
        _process_pdf(os.path.join(input_folder_path, filename), intermediate)
        # '\r' returns to the start of the line to overwrite "Processing".
        print(f'\r{green}{position:>4}/{num_files:<4}\t {filename:<13}\t\t Processed {reset}')

    print(f'All {num_files} files extracted successfully!')


if __name__ == '__main__':
    main()