Skip to content

Commit

Permalink
Fixed bugs in gdrive_workspace
Browse files Browse the repository at this point in the history
  • Loading branch information
rohitcoder committed Dec 7, 2023
1 parent 5098af7 commit 0fda18e
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 33 deletions.
20 changes: 20 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Use the official Python image as the base image
FROM python:3

# Install the native libraries the Python dependencies need at runtime:
#   - tesseract-ocr: the OCR engine binary that pytesseract invokes
#   - libgl1: provides libGL.so.1, required to import cv2 from opencv-python
# The apt package lists are removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends tesseract-ocr libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory in the container
WORKDIR /app

# Copy the local requirements.txt file to the container at /app
# (copied before the rest of the code so the dependency layer is cached)
COPY requirements.txt /app/

# Install the dependencies from requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the local code to the container at /app
COPY . /app/

# Install the Python package (assuming it contains a setup.py file)
RUN pip3 install --no-cache-dir .

# Set the entrypoint to hawk_scanner
ENTRYPOINT ["hawk_scanner"]
64 changes: 38 additions & 26 deletions hawk_scanner/commands/gdrive_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,53 @@ def connect_google_drive(credentials_file, impersonate_user=None):
print(f"Failed to connect to Google Drive: {e}")

def download_file(drive, file_obj, base_path):
print(f"Downloading file: {file_obj['name']} to {base_path}")
try:
file_name = file_obj['name']
file_id = file_obj['id']

folder_path = base_path

# Handle parents (folders)
if 'parents' in file_obj:
for parent_id in file_obj['parents']:
parent_folder = drive.files().get(fileId=parent_id).execute()
if parent_folder['name'] == 'My Drive':
continue
folder_path = os.path.join(folder_path, parent_folder['name'])
parent_folder_name = parent_folder['name']

# Update folder_path to include the parent folder
folder_path = os.path.join(folder_path, parent_folder_name)

file_path = os.path.join(folder_path, file_name)
# Update folder_path to include the current file's name
folder_path = os.path.join(folder_path, file_name)

if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
if not os.path.exists(file_path):
os.makedirs(file_path)
if not os.path.exists(folder_path):
os.makedirs(folder_path)
folder_files = drive.files().list(q=f"'{file_id}' in parents").execute().get('files', [])
for folder_file in folder_files:
download_file(drive, folder_file, folder_path)
else:
download_url = drive.files().get_media(fileId=file_id).execute()
with open(file_path, 'wb') as fh:
fh.write(download_url)

system.print_debug(f"File downloaded to: {file_path}")
try:
# Check if the file is a Google Docs type
if 'application/vnd.google-apps' in file_obj.get('mimeType', ''):
# For Google Docs Editors files, use export instead of GetMedia
response = drive.files().export(fileId=file_id, mimeType='application/pdf').execute()
with open(folder_path, 'wb') as f:
f.write(response)
else:
# For other file types, use GetMedia
content = drive.files().get_media(fileId=file_id).execute()
with open(folder_path, 'wb') as f:
f.write(content)
except Exception as e:
print(f"Failed to write file: {e}")

system.print_debug(f"File downloaded to: {folder_path}")
except Exception as e:
print(f"Failed to download file: {e}")



def list_files(drive, impersonate_user=None):
try:
query = "'root' in parents"
Expand Down Expand Up @@ -88,20 +106,13 @@ def execute(args):
if drive:
files = list_files(drive, impersonate_user)
for file_obj in files:
download_file(drive, file_obj, "data/google_drive")

if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.document' or file_obj['mimeType'] == 'application/vnd.google-apps.spreadsheet' or file_obj['mimeType'] == 'application/vnd.google-apps.presentation' or file_obj['mimeType'] == 'application/vnd.google-apps.drawing' or file_obj['mimeType'] == 'application/vnd.google-apps.script':
file_obj['name'] = file_obj['name'] + '-runtime.pdf'

file_id = file_obj['id']
file_name = file_obj['name']
if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
continue

parent_folder_ids = file_obj.get('parents', [])
folder_path = "data/google_drive"
if parent_folder_ids:
for parent_id in parent_folder_ids:
parent_folder = drive.files().get(fileId=parent_id).execute()
if parent_folder['name'] == 'My Drive':
continue
folder_path = os.path.join(folder_path, parent_folder['name'])

file_path = os.path.join(folder_path, file_name)

Expand All @@ -115,9 +126,10 @@ def execute(args):
is_cache_enabled = True

if is_cache_enabled:
download_file(drive, file_obj, "data/google_drive")
download_file(drive, file_obj, "data/google_drive/")

matches = system.read_match_strings(file_path, 'gdrive')
matches = system.read_match_strings(file_path, 'gdrive_workspace')
file_name = file_name.replace('-runtime.pdf', '')
if matches:
for match in matches:
results.append({
Expand All @@ -136,8 +148,8 @@ def execute(args):
else:
system.print_error("No Google Drive connection details found in connection file")

if not is_cache_enabled:
os.system("rm -rf data/google_drive")
"""if not is_cache_enabled:
os.system("rm -rf data/google_drive")"""

return results

Expand Down
51 changes: 46 additions & 5 deletions hawk_scanner/internals/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import json, requests, argparse, yaml, re, datetime, os, subprocess, platform, hashlib
from tinydb import TinyDB, Query
import pytesseract
from PIL import Image
from PIL import Image, ImageEnhance
from docx import Document
from openpyxl import load_workbook
import PyPDF2
import patoolib
import tempfile
import shutil
import os
import os, cv2
import tarfile

# Create a TinyDB instance for storing previous alert hashes
Expand Down Expand Up @@ -256,10 +256,12 @@ def read_match_strings(file_path, source):

try:
# Check if the file is an image
print(file_path)
if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
# Use OCR to extract text from the image
image = Image.open(file_path)
content = pytesseract.image_to_string(image)
print("ocr started for "+file_path)
content = enhance_and_ocr(file_path)
print("texts")
print(content)
# Check if the file is a PDF document
elif file_path.lower().endswith('.pdf'):
content = read_pdf(file_path)
Expand Down Expand Up @@ -411,3 +413,42 @@ def SlackNotify(msg):
db.insert({'msg_hash': msg_hash})
except Exception as e:
print_error(f"An error occurred: {str(e)}")

def enhance_and_ocr(image_path, enhanced_output_path="enhanced_image.png"):
    """Enhance the image stored at ``image_path`` and return its OCR text.

    Args:
        image_path: Path of the image file to open.
        enhanced_output_path: Where to save the enhanced image for
            reference. Defaults to ``"enhanced_image.png"`` (relative to the
            current working directory). Pass ``None`` to skip writing the
            reference copy.

    Returns:
        The value returned by ``perform_ocr`` for the enhanced image.
    """
    # Open via a context manager so the underlying file handle is released
    # as soon as enhancement is finished, instead of lingering until GC.
    with Image.open(image_path) as original_image:
        enhanced_image = enhance_image(original_image)

    # Save the enhanced image for reference (optional)
    if enhanced_output_path is not None:
        enhanced_image.save(enhanced_output_path)

    # Perform OCR on the enhanced image
    return perform_ocr(enhanced_image)

def enhance_image(image, contrast_factor=2.0, threshold_value=100):
    """Return a grayscale, contrast-boosted, thresholded and denoised copy of ``image``.

    Args:
        image: A PIL image.
        contrast_factor: Factor passed to ``ImageEnhance.Contrast.enhance``.
        threshold_value: Pixels below this value become 0; all others
            become 255.

    Returns:
        A new PIL image built from the denoised pixel array.
    """
    # Imported locally so the function does not rely on a module-level
    # ``np`` binding, which the file's visible import block does not provide.
    import numpy as np

    # Convert to grayscale
    grayscale_image = image.convert('L')

    # Increase contrast
    contrast_enhancer = ImageEnhance.Contrast(grayscale_image)
    contrast_enhanced_image = contrast_enhancer.enhance(contrast_factor)

    # Apply thresholding: map every pixel to pure black or pure white
    thresholded_image = contrast_enhanced_image.point(
        lambda x: 0 if x < threshold_value else 255
    )

    # Reduce noise; OpenCV works on arrays, so convert from PIL and back
    denoised_image = cv2.fastNlMeansDenoising(
        np.array(thresholded_image),
        None,
        h=10,
        templateWindowSize=7,
        searchWindowSize=21,
    )

    return Image.fromarray(denoised_image)

def perform_ocr(image):
    """Run Tesseract OCR on ``image`` and return the result of ``pytesseract.image_to_string``."""
    return pytesseract.image_to_string(image)
5 changes: 5 additions & 0 deletions hawk_scanner/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ def main():

if args.json:
with open(args.json, 'w') as file:
#file_path = file_path.replace('-runtime.pdf', '')
if 'gdrive_workspace' in grouped_results:
for result in grouped_results['gdrive_workspace']:
result['file_name'] = result['file_name'].replace('-runtime.pdf', '')

file.write(json.dumps(grouped_results, indent=4))
system.print_success(f"Results saved to {args.json}")
sys.exit(0)
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ pydrive2
appdirs
tqdm
funcy
fsspec
fsspec
opencv-python
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = "0.3.6"
VERSION = "0.3.7"

from setuptools import setup, find_packages

Expand Down

0 comments on commit 0fda18e

Please sign in to comment.