Fixed bugs in gdrive_workspace

rohitcoder · Dec 7, 2023 · 0fda18e · 0fda18e
1 parent 5098af7
commit 0fda18e
Show file tree

Hide file tree

Showing 6 changed files with 112 additions and 33 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,20 @@
+# Use the official Python image as the base image
+FROM python:3
+
+# Install the native libraries the Python dependencies load at runtime:
+# the Tesseract OCR engine (driven by pytesseract) and libGL (needed to import cv2)
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends tesseract-ocr libgl1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the local requirements.txt file to the container at /app
+COPY requirements.txt /app/
+
+# Install the dependencies from requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Copy the local code to the container at /app
+COPY . /app/
+
+# Install the Python package (assuming it contains a setup.py file)
+RUN pip3 install .
+
+# Set the entrypoint to hawk_scanner
+ENTRYPOINT ["hawk_scanner"]
diff --git a/hawk_scanner/commands/gdrive_workspace.py b/hawk_scanner/commands/gdrive_workspace.py
@@ -23,35 +23,53 @@ def connect_google_drive(credentials_file, impersonate_user=None):
         print(f"Failed to connect to Google Drive: {e}")
 
 def download_file(drive, file_obj, base_path):
+    """Download one Drive entry beneath base_path, recursing into folders.
+
+    Entries whose mimeType contains 'application/vnd.google-apps' (other than
+    folders) are exported as PDF; everything else is fetched with get_media.
+    Exceptions are printed instead of raised, and nothing is returned.
+    """
     try:
         file_name = file_obj['name']
         file_id = file_obj['id']
+        # Logged inside the try so a file_obj without 'name' is reported, not raised
+        print(f"Downloading file: {file_name} to {base_path}")
 
         folder_path = base_path
+
+        # Handle parents (folders)
         if 'parents' in file_obj:
             for parent_id in file_obj['parents']:
                 parent_folder = drive.files().get(fileId=parent_id).execute()
-                if parent_folder['name'] == 'My Drive':
-                    continue
-                folder_path = os.path.join(folder_path, parent_folder['name'])
+                parent_folder_name = parent_folder['name']
+
+                # Update folder_path to include the parent folder
+                folder_path = os.path.join(folder_path, parent_folder_name)
 
-        file_path = os.path.join(folder_path, file_name)
+        # Update folder_path to include the current file's name
+        folder_path = os.path.join(folder_path, file_name)
 
         if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
-            if not os.path.exists(file_path):
-                os.makedirs(file_path)
+            if not os.path.exists(folder_path):
+                os.makedirs(folder_path)
             folder_files = drive.files().list(q=f"'{file_id}' in parents").execute().get('files', [])
             for folder_file in folder_files:
                 download_file(drive, folder_file, folder_path)
         else:
-            download_url = drive.files().get_media(fileId=file_id).execute()
-            with open(file_path, 'wb') as fh:
-                fh.write(download_url)
-
-        system.print_debug(f"File downloaded to: {file_path}")
+            try:
+                # Check if the file is a Google Docs type
+                if 'application/vnd.google-apps' in file_obj.get('mimeType', ''):
+                    # For Google Docs Editors files, use export instead of GetMedia
+                    response = drive.files().export(fileId=file_id, mimeType='application/pdf').execute()
+                    with open(folder_path, 'wb') as f:
+                        f.write(response)
+                else:
+                    # For other file types, use GetMedia
+                    content = drive.files().get_media(fileId=file_id).execute()
+                    with open(folder_path, 'wb') as f:
+                        f.write(content)
+            except Exception as e:
+                print(f"Failed to write file: {e}")
+                # Skip the success message below: the download did not complete
+                return
+
+        system.print_debug(f"File downloaded to: {folder_path}")
     except Exception as e:
         print(f"Failed to download file: {e}")
 
+
+
 def list_files(drive, impersonate_user=None):
     try:
         query = "'root' in parents"
@@ -88,20 +106,13 @@ def execute(args):
                 if drive:
                     files = list_files(drive, impersonate_user)
                     for file_obj in files:
-                        download_file(drive, file_obj, "data/google_drive")
+
+                        if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.document' or file_obj['mimeType'] == 'application/vnd.google-apps.spreadsheet' or file_obj['mimeType'] == 'application/vnd.google-apps.presentation' or file_obj['mimeType'] == 'application/vnd.google-apps.drawing' or file_obj['mimeType'] == 'application/vnd.google-apps.script':
+                            file_obj['name'] = file_obj['name'] + '-runtime.pdf'
+
                         file_id = file_obj['id']
                         file_name = file_obj['name']
-                        if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
-                            continue
-
-                        parent_folder_ids = file_obj.get('parents', [])
                         folder_path = "data/google_drive"
-                        if parent_folder_ids:
-                            for parent_id in parent_folder_ids:
-                                parent_folder = drive.files().get(fileId=parent_id).execute()
-                                if parent_folder['name'] == 'My Drive':
-                                    continue
-                                folder_path = os.path.join(folder_path, parent_folder['name'])
 
                         file_path = os.path.join(folder_path, file_name)
 
@@ -115,9 +126,10 @@ def execute(args):
                             is_cache_enabled = True
 
                         if is_cache_enabled:
-                            download_file(drive, file_obj, "data/google_drive")
+                            download_file(drive, file_obj, "data/google_drive/")
 
-                        matches = system.read_match_strings(file_path, 'gdrive')
+                        matches = system.read_match_strings(file_path, 'gdrive_workspace')
+                        file_name = file_name.replace('-runtime.pdf', '')
                         if matches:
                             for match in matches:
                                 results.append({
@@ -136,8 +148,8 @@ def execute(args):
     else:
         system.print_error("No Google Drive connection details found in connection file")
 
-    if not is_cache_enabled:
-        os.system("rm -rf data/google_drive")
+    """if not is_cache_enabled:
+        os.system("rm -rf data/google_drive")"""
 
     return results
 

diff --git a/hawk_scanner/internals/system.py b/hawk_scanner/internals/system.py
@@ -3,14 +3,14 @@
 import json, requests, argparse, yaml, re, datetime, os, subprocess, platform, hashlib
 from tinydb import TinyDB, Query
 import pytesseract
-from PIL import Image
+from PIL import Image, ImageEnhance
 from docx import Document
 from openpyxl import load_workbook
 import PyPDF2
 import patoolib
 import tempfile
 import shutil
-import os
+import os, cv2
 import tarfile
 
 # Create a TinyDB instance for storing previous alert hashes
@@ -256,10 +256,12 @@ def read_match_strings(file_path, source):
 
     try:
         # Check if the file is an image
+        print(file_path)
         if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
-            # Use OCR to extract text from the image
-            image = Image.open(file_path)
-            content = pytesseract.image_to_string(image)
+            print("ocr started for "+file_path)
+            content = enhance_and_ocr(file_path)
+            print("texts")
+            print(content)
         # Check if the file is a PDF document
         elif file_path.lower().endswith('.pdf'):
             content = read_pdf(file_path)
@@ -411,3 +413,42 @@ def SlackNotify(msg):
                     db.insert({'msg_hash': msg_hash})
             except Exception as e:
                 print_error(f"An error occurred: {str(e)}")
+
+def enhance_and_ocr(image_path):
+    """Run OCR on the image at image_path after enhancing it.
+
+    Returns whatever perform_ocr produces for the output of enhance_image.
+    The enhanced image is kept in memory only; no copy is written to disk.
+    """
+    # Close the source file as soon as the enhanced copy exists; enhance_image
+    # hands back a separate image, so nothing reads the file afterwards
+    with Image.open(image_path) as original_image:
+        enhanced_image = enhance_image(original_image)
+
+    # Perform OCR on the enhanced image
+    return perform_ocr(enhanced_image)
+
+def enhance_image(image, contrast_factor=2.0, threshold_value=100):
+    """Return a grayscale, contrast-boosted, binarized and denoised copy of image.
+
+    image is a PIL image. contrast_factor is passed to ImageEnhance.Contrast;
+    pixels below threshold_value become 0 and all others 255. The result is
+    a new PIL image built from the denoised array.
+    """
+    # Local import so the array conversion below does not rely on a
+    # module-level numpy import
+    import numpy as np
+
+    # Convert to grayscale
+    grayscale_image = image.convert('L')
+
+    # Increase contrast
+    contrast_enhancer = ImageEnhance.Contrast(grayscale_image)
+    contrast_enhanced_image = contrast_enhancer.enhance(contrast_factor)
+
+    # Apply thresholding
+    thresholded_image = contrast_enhanced_image.point(lambda x: 0 if x < threshold_value else 255)
+
+    # Reduce noise (optional)
+    denoised_image = cv2.fastNlMeansDenoising(np.array(thresholded_image), None, h=10, templateWindowSize=7, searchWindowSize=21)
+
+    return Image.fromarray(denoised_image)
+
+def perform_ocr(image):
+    """Return the text Tesseract recognizes in image."""
+    return pytesseract.image_to_string(image)
diff --git a/hawk_scanner/main.py b/hawk_scanner/main.py
@@ -72,6 +72,11 @@ def main():
 
     if args.json:
         with open(args.json, 'w') as file:
+            #file_path = file_path.replace('-runtime.pdf', '')
+            if 'gdrive_workspace' in grouped_results:
+                for result in grouped_results['gdrive_workspace']:
+                    result['file_name'] = result['file_name'].replace('-runtime.pdf', '')
+
             file.write(json.dumps(grouped_results, indent=4))
         system.print_success(f"Results saved to {args.json}")
         sys.exit(0)

diff --git a/requirements.txt b/requirements.txt
@@ -20,4 +20,5 @@ pydrive2
 appdirs
 tqdm
 funcy
-fsspec
+fsspec
+opencv-python
diff --git a/setup.py b/setup.py
@@ -1,4 +1,4 @@
-VERSION = "0.3.6"
+VERSION = "0.3.7"
 
 from setuptools import setup, find_packages
@@ -20,4 +20,5 @@ pydrive2
     appdirs
     tqdm
     funcy
-    fsspec
+    fsspec
+    opencv-python