From 0d964e47d5a865ef6d247ed12695cc127071f4e9 Mon Sep 17 00:00:00 2001
From: Oksana Belyaeva <belyaeva@ispras.ru>
Date: Tue, 3 Dec 2024 16:07:58 +0300
Subject: [PATCH] TLDR-872 rewrite benchmark correctness (#510)

---
 .../benchmarks/benchmarks_tl_correctness.json |  21 ---
 .../benchmarks/benchmarks_tl_correctness.txt  |  27 ++++
 scripts/benchmark_tl_correctness.py           | 123 ++++++++++++------
 3 files changed, 108 insertions(+), 63 deletions(-)
 delete mode 100644 resources/benchmarks/benchmarks_tl_correctness.json
 create mode 100644 resources/benchmarks/benchmarks_tl_correctness.txt

diff --git a/resources/benchmarks/benchmarks_tl_correctness.json b/resources/benchmarks/benchmarks_tl_correctness.json
deleted file mode 100644
index f3fee769..00000000
--- a/resources/benchmarks/benchmarks_tl_correctness.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-    "version": "0.11.2",
-    "guessing_the_correctness_of_the_text": {
-        "percentage_of_guessed_correct_tl": 0.9785407725321889,
-        "list_of_file_with_incorrect_tl": [
-            "hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf",
-            "afcea-spy.pdf",
-            "b96a__usmc-combat-camera-directory.pdf",
-            "access-the-vision-for-2013.pdf",
-            "demystifying-nge-rock-ridge_1643518222_537.pdf"
-        ],
-        "percentage_of_guessed_incorrect_tl": 0.7916666666666666,
-        "list_of_file_with_correct_tl": [
-            "PE20_1616439522_1.pdf",
-            "slides.pdf",
-            "PE157_1616278053_181.pdf",
-            "EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf",
-            "╨º╨£╨£╨ñ_╨É╨▒╨░╨║╤â╨╝╨╛╨▓_╤â╤ç╨╡╨▒╨╜╨╕╨║.pdf"
-        ]
-    }
-}
\ No newline at end of file
diff --git a/resources/benchmarks/benchmarks_tl_correctness.txt b/resources/benchmarks/benchmarks_tl_correctness.txt
new file mode 100644
index 00000000..351e9fd8
--- /dev/null
+++ b/resources/benchmarks/benchmarks_tl_correctness.txt
@@ -0,0 +1,27 @@
+Version = 
+
+--- Balanced Accuracy --- = 0.843482905982906
+--- Accuracy --- = 0.9534883720930233
+--- Weighted --- Precision = 0.9519564983695847, Recall=0.9534883720930233, F1=0.9525762106576597
+--- Class corrected --- : Precision = 0.9703389830508474, Recall=0.9786324786324786, F1=0.9744680851063829
+--- Class incorrected --- : Precision = 0.7727272727272727, Recall=0.7083333333333334, F1=0.7391304347826088
+--- AVG Time corrected pdfs --- = 3.2058254999992175
+--- AVG Time incorrected pdfs --- = 4.9308231472969055
+--- AVG Time all pdfs --- = 3.3662903974222584
+
+
+--- Failed corrected pdfs --- : 
+hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf
+demystifying-nge-rock-ridge_1643518222_537.pdf
+b96a__usmc-combat-camera-directory.pdf
+afcea-spy.pdf
+access-the-vision-for-2013.pdf
+
+--- Failed incorrected pdfs --- : 
+Gromov_Dubova_-_Primenenie_metodov_TFKP_k_vychisleniyu_opredelennykh_integralov.pdf
+PE157_1616278053_181.pdf
+╨º╨£╨£╨ñ_╨É╨▒╨░╨║╤â╨╝╨╛╨▓_╤â╤ç╨╡╨▒╨╜╨╕╨║.pdf
+EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf
+slides.pdf
+PE20_1616439522_1.pdf
+Catalog-2020_dealers mail (1).pdf
\ No newline at end of file
diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py
index 2538cdef..5469f309 100644
--- a/scripts/benchmark_tl_correctness.py
+++ b/scripts/benchmark_tl_correctness.py
@@ -1,10 +1,11 @@
-import json
 import os
 import zipfile
-from collections import OrderedDict, namedtuple
+from time import time
 
+import numpy as np
 import requests
 import wget
+from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
 from tqdm import tqdm
 
 from dedoc.config import get_config
@@ -12,7 +13,7 @@
 
 path_result = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks"))
 os.makedirs(path_result, exist_ok=True)
-path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")
+path_result = os.path.join(path_result, "benchmarks_tl_correctness.txt")
 
 """
 Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main :
@@ -24,37 +25,10 @@
 """
 
 host = "http://localhost:1231"
-param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))
 
 
-def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
-    failed = []
-    total_incorrect_files = 0
-    directory = os.path.join(path_base, tl_path)
-    files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")]
-    total_file_size = len(files_list)
-    print(f"Files: {files_list}\nFiles number: {total_file_size}")
-    for file in tqdm(files_list):
-        file_path = os.path.join(directory, file)
-        r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
-
-        found = False
-        for warning in r["warnings"]:
-            if warning.find(tl_type) != -1:
-                found = True
-                break
-
-        if found:
-            total_incorrect_files += 1
-            failed.append(file)
-    return param_dist_errors(total_file_size, total_incorrect_files, failed)
-
-
-if __name__ == "__main__":
-    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
-    os.makedirs(data_dir, exist_ok=True)
+def download_dataset(data_dir: str) -> str:
     benchmark_data_dir = os.path.join(data_dir, "data_with_text_layer")
-
     if not os.path.isdir(benchmark_data_dir):
         path_out = os.path.join(data_dir, "data_with_text_layer.zip")
         wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out)
@@ -67,20 +41,85 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
 
     assert os.path.isdir(benchmark_data_dir)
 
-    result = OrderedDict()
-    result["version"] = requests.get(f"{host}/version").text
+    return benchmark_data_dir
+
+
+def get_metrics(max_eval_pdf: int = 10000) -> None:
+    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_dir = download_dataset(data_dir)
+
+    folder = os.path.join(data_dir, "data_correct_text_layer")
+    correct_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")])
+    folder = os.path.join(data_dir, "data_incorrect_text_layer")
+    incorrect_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")])
+
+    files = np.append(correct_files, incorrect_files)
+
+    labels = np.empty(files.size)
+    labels[:correct_files.size] = 0  # "correct"
+    labels[correct_files.size:] = 1  # "incorrect"
+
+    failed_corrected_pdfs = []
+    failed_incorrected_pdfs = []
+
+    # run pipeline for prediction
+    predicts = np.empty(files.size)
     parameters = dict(pdf_with_text_layer="auto", pages="1:1")
-    result_item = OrderedDict()
+    times_correct, times_incorrect = [], []
+
+    count = min(max_eval_pdf, len(files))
+
+    for i, file_path in enumerate(tqdm(files[:count])):
+        file_name = file_path.split("/")[-1]
+
+        time_b = time()
+        r = send_file(host=host, file_name=file_name, file_path=file_path, parameters=parameters)
+        time_eval = time() - time_b
+
+        if labels[i] == 0:
+            times_correct.append(time_eval)
+        else:
+            times_incorrect.append(time_eval)
+
+        predicts[i] = 3  # "failed": stays 3 when neither textual-layer warning is found below
+        for warning in r["warnings"]:
+            if "has incorrect textual layer" in warning:
+                predicts[i] = 1  # "incorrect"
+            if "has a correct textual layer" in warning:
+                predicts[i] = 0  # "correct"
 
-    incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters)
-    result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
-    result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed
+        if predicts[i] != labels[i]:
+            failed_corrected_pdfs.append(file_name) if labels[i] == 0 else failed_incorrected_pdfs.append(file_name)
 
-    correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters)
-    result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
-    result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
-    result["guessing_the_correctness_of_the_text"] = result_item
+    labels, predicts = labels[:count], predicts[:count]
 
+    b_accuracy = balanced_accuracy_score(labels, predicts)
+    accuracy = accuracy_score(labels, predicts)
+    w_avg = precision_recall_fscore_support(labels, predicts, average="weighted")
+    avg = precision_recall_fscore_support(labels, predicts, average=None, labels=[0, 1])
+
+    output = f"Version = {requests.get(host + '/version').text}\n\n"
+
+    output += f"--- Balanced Accuracy --- = {b_accuracy}\n"
+    output += f"--- Accuracy --- = {accuracy}\n"
+    output += f"--- Weighted --- Precision = {w_avg[0]}, Recall={w_avg[1]}, F1={w_avg[2]}\n"
+    output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n"
+    output += f"--- Class incorrected --- : Precision = {avg[0][1]}, Recall={avg[1][1]}, F1={avg[2][1]}\n"
+
+    output += f"--- AVG Time corrected pdfs --- = {np.mean(times_correct)}\n"
+    output += f"--- AVG Time incorrected pdfs --- = {np.mean(times_incorrect)}\n"
+    output += f"--- AVG Time all pdfs --- = {np.mean(times_correct + times_incorrect)}\n"
+
+    output += "\n\n--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs)  # noqa
+    output += "\n\n--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs)  # noqa
+
+    print(output)
     with open(path_result, "w") as file_out:
-        json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
+        file_out.write(output)
     print(f"Save result in {path_result}")
+
+
+if __name__ == "__main__":
+    get_metrics()