TLDR-872 rewrite benchmark correctness (#510)
oksidgy authored Dec 3, 2024
1 parent 6a60e97 commit 0d964e4
Showing 3 changed files with 108 additions and 63 deletions.
21 changes: 0 additions & 21 deletions resources/benchmarks/benchmarks_tl_correctness.json

This file was deleted.
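For reference, the deleted report had roughly the following shape; this is a sketch reconstructed from the removed code in the script diff further down (the placeholder values are illustrative, not the deleted file's actual contents):

# Shape of the deleted JSON report, inferred from the removed json.dump code below.
result = {
    "version": "...",
    "guessing_the_correctness_of_the_text": {
        "percentage_of_guessed_correct_tl": 0.0,
        "list_of_file_with_incorrect_tl": ["..."],
        "percentage_of_guessed_incorrect_tl": 0.0,
        "list_of_file_with_correct_tl": ["..."],
    },
}

The commit replaces this nested structure with the flat text report shown next.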

27 changes: 27 additions & 0 deletions resources/benchmarks/benchmarks_tl_correctness.txt
@@ -0,0 +1,27 @@
Version =

--- Balanced Accuracy --- = 0.843482905982906
--- Accuracy --- = 0.9534883720930233
--- Weighted --- Precision = 0.9519564983695847, Recall=0.9534883720930233, F1=0.9525762106576597
--- Class correct --- : Precision = 0.9703389830508474, Recall=0.9786324786324786, F1=0.9744680851063829
--- Class incorrect --- : Precision = 0.7727272727272727, Recall=0.7083333333333334, F1=0.7391304347826088
--- AVG Time correct pdfs --- = 3.2058254999992175
--- AVG Time incorrect pdfs --- = 4.9308231472969055
--- AVG Time all pdfs --- = 3.3662903974222584


--- Failed correct pdfs --- :
hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf
demystifying-nge-rock-ridge_1643518222_537.pdf
b96a__usmc-combat-camera-directory.pdf
afcea-spy.pdf
access-the-vision-for-2013.pdf

--- Failed incorrect pdfs --- :
Gromov_Dubova_-_Primenenie_metodov_TFKP_k_vychisleniyu_opredelennykh_integralov.pdf
PE157_1616278053_181.pdf
ЧММФ_Абакумов_учебник.pdf
EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf
slides.pdf
PE20_1616439522_1.pdf
Catalog-2020_dealers mail (1).pdf
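
As a side note, the summary metrics above are internally consistent with the failure lists. A minimal sanity-check sketch, assuming the class supports implied by the recalls (234 correct-layer and 24 incorrect-layer PDFs, of which the 5 and 7 files listed above were misclassified):

# Sanity check of the metrics above (not part of the commit).
n_correct, n_incorrect = 234, 24          # class supports implied by the recalls
failed_correct, failed_incorrect = 5, 7   # misclassified files listed above

recall_correct = (n_correct - failed_correct) / n_correct           # 229/234 = 0.9786...
recall_incorrect = (n_incorrect - failed_incorrect) / n_incorrect   # 17/24 = 0.7083...

# Balanced accuracy is the unweighted mean of the per-class recalls.
print((recall_correct + recall_incorrect) / 2)    # 0.843482905982906
# Plain accuracy counts all correctly classified files.
print((229 + 17) / (n_correct + n_incorrect))     # 0.9534883720930233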
123 changes: 81 additions & 42 deletions scripts/benchmark_tl_correctness.py
@@ -1,18 +1,19 @@
-import json
import os
import zipfile
-from collections import OrderedDict, namedtuple
+from time import time

+import numpy as np
import requests
import wget
+from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

from dedoc.config import get_config
from dedoc.utils.utils import send_file

path_result = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks"))
os.makedirs(path_result, exist_ok=True)
-path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")
+path_result = os.path.join(path_result, "benchmarks_tl_correctness.txt")

"""
Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main :
@@ -24,37 +25,10 @@
"""

host = "http://localhost:1231"
-param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))


-def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
-    failed = []
-    total_incorrect_files = 0
-    directory = os.path.join(path_base, tl_path)
-    files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")]
-    total_file_size = len(files_list)
-    print(f"Files: {files_list}\nFiles number: {total_file_size}")
-    for file in tqdm(files_list):
-        file_path = os.path.join(directory, file)
-        r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
-
-        found = False
-        for warning in r["warnings"]:
-            if warning.find(tl_type) != -1:
-                found = True
-                break
-
-        if found:
-            total_incorrect_files += 1
-            failed.append(file)
-    return param_dist_errors(total_file_size, total_incorrect_files, failed)
-
-
-if __name__ == "__main__":
-    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
-    os.makedirs(data_dir, exist_ok=True)
+def download_dataset(data_dir: str) -> str:
    benchmark_data_dir = os.path.join(data_dir, "data_with_text_layer")

    if not os.path.isdir(benchmark_data_dir):
        path_out = os.path.join(data_dir, "data_with_text_layer.zip")
        wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out)
@@ -67,20 +41,85 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:

    assert os.path.isdir(benchmark_data_dir)

-    result = OrderedDict()
-    result["version"] = requests.get(f"{host}/version").text
+    return benchmark_data_dir
+
+
+def get_metrics(max_eval_pdf: int = 10000) -> None:
+    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_dir = download_dataset(data_dir)
+
+    folder = os.path.join(data_dir, "data_correct_text_layer")
+    correct_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")])
+    folder = os.path.join(data_dir, "data_incorrect_text_layer")
+    incorrect_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")])
+
+    files = np.append(correct_files, incorrect_files)
+
+    labels = np.empty(files.size)
+    labels[:correct_files.size] = 0  # "correct"
+    labels[correct_files.size:] = 1  # "incorrect"
+
+    failed_correct_pdfs = []
+    failed_incorrect_pdfs = []
+
+    # run the pipeline to predict the textual layer class of each PDF
+    predicts = np.empty(files.size)
    parameters = dict(pdf_with_text_layer="auto", pages="1:1")
-    result_item = OrderedDict()
+    times_correct, times_incorrect = [], []
+
+    count = min(max_eval_pdf, len(files))
+
+    for i, file_path in enumerate(tqdm(files[:count])):
+        file_name = file_path.split("/")[-1]
+
+        time_b = time()
+        r = send_file(host=host, file_name=file_name, file_path=file_path, parameters=parameters)
+        time_eval = time() - time_b
+
+        if labels[i] == 0:
+            times_correct.append(time_eval)
+        else:
+            times_incorrect.append(time_eval)
+
+        predicts[i] = 3  # 3 = "failed": no textual layer warning found in the response
+        for warning in r["warnings"]:
+            if "has incorrect textual layer" in warning:
+                predicts[i] = 1  # "incorrect"
+            if "has a correct textual layer" in warning:
+                predicts[i] = 0  # "correct"

-    incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters)
-    result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
-    result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed
+        if predicts[i] != labels[i]:
+            failed_correct_pdfs.append(file_name) if labels[i] == 0 else failed_incorrect_pdfs.append(file_name)

-    correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters)
-    result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
-    result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
-    result["guessing_the_correctness_of_the_text"] = result_item
+    labels, predicts = labels[:count], predicts[:count]
+
+    b_accuracy = balanced_accuracy_score(labels, predicts)
+    accuracy = accuracy_score(labels, predicts)
+    w_avg = precision_recall_fscore_support(labels, predicts, average="weighted")
+    avg = precision_recall_fscore_support(labels, predicts, average=None, labels=[0, 1])
+
+    output = f"Version = {requests.get(host + '/version').text}\n\n"
+
+    output += f"--- Balanced Accuracy --- = {b_accuracy}\n"
+    output += f"--- Accuracy --- = {accuracy}\n"
+    output += f"--- Weighted --- Precision = {w_avg[0]}, Recall={w_avg[1]}, F1={w_avg[2]}\n"
+    output += f"--- Class correct --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n"
+    output += f"--- Class incorrect --- : Precision = {avg[0][1]}, Recall={avg[1][1]}, F1={avg[2][1]}\n"
+
+    output += f"--- AVG Time correct pdfs --- = {np.mean(times_correct)}\n"
+    output += f"--- AVG Time incorrect pdfs --- = {np.mean(times_incorrect)}\n"
+    output += f"--- AVG Time all pdfs --- = {np.mean(times_correct + times_incorrect)}\n"
+
+    output += "\n\n--- Failed correct pdfs --- : \n" + '\n'.join(failed_correct_pdfs)  # noqa
+    output += "\n\n--- Failed incorrect pdfs --- : \n" + '\n'.join(failed_incorrect_pdfs)  # noqa
+
+    print(output)
    with open(path_result, "w") as file_out:
-        json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
+        file_out.write(output)
    print(f"Save result in {path_result}")


+if __name__ == "__main__":
+    get_metrics()
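
A note on running this benchmark: the script talks to a dedoc server at http://localhost:1231 (the host constant above) and downloads the dataset on first use. A minimal smoke run might look like the sketch below; the module import path is an assumption based on the repository layout, not something the commit specifies.

# Hypothetical smoke run: assumes a dedoc server is already listening on
# http://localhost:1231 (e.g. started from the dedoc repo's docker setup).
from scripts.benchmark_tl_correctness import get_metrics

get_metrics(max_eval_pdf=20)  # cap the number of PDFs sent for a quick check

Capping max_eval_pdf only truncates the evaluation set, so the reported metrics on a capped run will not match the committed benchmarks_tl_correctness.txt, which was produced over the full dataset.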
