Wazzabeee · jokerale · Apr 18, 2024 · May 8, 2024 · Jun 17, 2024
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # Copy Spotter
-
+![GIF demo](data/img/example.gif)
 ![PyPI - Version](https://img.shields.io/pypi/v/copy-spotter) ![PyPI - License](https://img.shields.io/pypi/l/copy-spotter)
 ![Python](https://img.shields.io/badge/python-3.11-blue)
 
@@ -16,6 +16,10 @@ This program will process pdf, txt, docx, and odt files that can be found in the
 $ pip install copy-spotter
 $ copy-spotter [-s] [-o] [-h] input_directory
 ```
+Usage: python -m scripts.main input_directory [OPTIONS]
+
+  Performs a similarity analysis of all text files available in given input directory.
+  Developed by Clément Delteil -> (Github: Wazzabeee)
 ***Positional Arguments:***
 * `input_directory`: One directory that contains all files (pdf, txt, docx, odt) (see `data/pdf/plagiarism` for example)
 
@@ -54,12 +58,30 @@ $ cd copy_spotter
 
 # Install requirements
 $ pip install -r requirements.txt
+
+# Run the app
+$ python -m scripts.main data/pdf/plagiarism -s 2
+```
+**First run**
+---
+On the first run you might get:
+- an ImportError from the pdfminer library
+``` 
+ImportError: cannot import name 'uint_value' from 'pdfminer.pdftypes' (C:/.../pdfminer/pdftypes.py)
+```
+To fix this, uninstall pdfminer3k and pdfminer.six via:
+``` pip uninstall pdfminer3k ```
+``` pip uninstall pdfminer.six ```
+Then install them again via:
+``` pip install pdfminer3k ```
+``` pip install pdfminer.six ```
 $ pip install -r requirements_lint.txt
 
 # Install precommit
 $ pip install pre-commit
 $ pre-commit install
 
+
 # Run tests
 $ pip install pytest
 $ pytest tests/
@@ -78,6 +100,11 @@ $ python -m scripts.main [-s] [-o] [-h] input_directory
 
 **TODO**
 ---
+- Add more tests
+- Add info in console for timing (tqdm)
+- Add CSS to HTML Template
+- Add support for other folder structures
+- Fix Slate3k by installing custom fork
 - Add more tests on existing functions
 - Implement OCR with tesseract for scanned documents
-- Add custom naming option for pdf files
+- Add custom naming option for pdf files
diff --git a/data/pdf/pdf_tests/TestPDFfile_flat.pdf b/data/pdf/pdf_tests/TestPDFfile_flat.pdf
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/requirements.txt b/requirements.txt
@@ -2,5 +2,7 @@ beautifulsoup4==4.10.0
 nltk==3.6.6
 odfpy==1.4.1
 tabulate==0.8.9
+pytesseract==0.3.10
+pdf2image==1.17.0
 tqdm==4.66.3
-pdfminer.six==20200517
+pdfminer.six==20200517
diff --git a/scripts/html_writing.py b/scripts/html_writing.py
@@ -7,7 +7,7 @@
 
 """
 
-from os import fsync, path
+from os import fsync, rename, path
 from random import randint
 from shutil import copyfile, copy
 from typing import Any, List

diff --git a/scripts/main.py b/scripts/main.py
@@ -11,9 +11,7 @@
 from datetime import datetime
 from os import listdir, path
 from typing import List
-
 from tqdm import tqdm
-
 from scripts.html_writing import add_links_to_html_table, results_to_html, papers_comparison
 from scripts.html_utils import writing_results
 from scripts.processing_files import file_extension_call

diff --git a/scripts/processing_files.py b/scripts/processing_files.py
@@ -4,6 +4,9 @@
 import zipfile
 from os import path
 
+import slate3k as slate
+import pytesseract
+from pdf2image import convert_from_path
 from odf import text, teletype
 from odf.opendocument import load
 from pdfminer.high_level import extract_text
@@ -47,10 +50,20 @@ def get_words_from_pdf_file(pdf_path: str) -> list:
     cleaned_text = re.sub(r"\s+", " ", extracted_text)
     cleaned_text = re.sub(r"<(.|\n)*?>", "", cleaned_text)
 
-    # Extract words from the cleaned text
-    words = re.findall(r"\w+", cleaned_text.lower())
+    # Convert the pdfs into images
+    pages = convert_from_path(pdf_path, 300)
 
-    return words
+    extracted_text = ''
+
+    # Iterate over every page
+    for page in pages:
+        # Extract the text from the current page using OCR
+        text = pytesseract.image_to_string(page, lang='eng')
+
+        # Concat the extracted text
+        extracted_text += text
+
+    return extracted_text.replace("\xa0", " ").strip().split()
 
 
 def get_words_from_txt_file(txt_path: str) -> list:

diff --git a/setup.py b/setup.py
@@ -14,6 +14,8 @@ def get_version():
         "beautifulsoup4==4.10.0",
         "nltk==3.6.6",
         "odfpy==1.4.1",
+        "pdfplumber==0.5.28",
+        "slate3k==0.5.3",
         "tabulate==0.8.9",
         "tqdm==4.66.3",
         "pdfminer.six==20200517",
@@ -44,4 +46,5 @@ def get_version():
     package_data={
         "scripts": ["template.html"],
     },
+
 )