From 6c19c2c5a033e52d43f7baf9e496d8ca00ba34ba Mon Sep 17 00:00:00 2001 From: Rohit kumar Date: Thu, 11 Jan 2024 10:42:28 +0530 Subject: [PATCH] Resetting changes --- Dockerfile | 5 +- hawk_scanner/commands/couchdb.py | 57 ++++---- hawk_scanner/commands/firebase.py | 125 +++++++++--------- hawk_scanner/commands/fs.py | 82 ++++++------ hawk_scanner/commands/gcs.py | 118 +++++++++-------- hawk_scanner/commands/gdrive.py | 152 +++++++++++----------- hawk_scanner/commands/gdrive_workspace.py | 140 ++++++++++---------- hawk_scanner/commands/mongodb.py | 79 +++++------ hawk_scanner/commands/mysql.py | 61 ++++----- hawk_scanner/commands/postgresql.py | 71 +++++----- hawk_scanner/commands/redis.py | 45 ++++--- hawk_scanner/commands/s3.py | 128 +++++++++--------- hawk_scanner/commands/slack.py | 52 ++++---- hawk_scanner/commands/text.py | 19 ++- hawk_scanner/internals/system.py | 79 ++++++----- hawk_scanner/main.py | 21 +-- setup.py | 2 +- 17 files changed, 642 insertions(+), 594 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8f87b7a..0f5d283 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,6 @@ WORKDIR /app # Copy the local requirements.txt file to the container at /app COPY requirements.txt /app/ -RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # Install the dependencies from requirements.txt RUN pip3 install --no-cache-dir -r requirements.txt @@ -17,5 +16,5 @@ COPY . /app/ # Install the Python package (assuming it contains a setup.py file) RUN pip3 install . -# Run hawk_Scanner from python3 main.py -ENTRYPOINT ["python3", "main.py"] \ No newline at end of file +# Set the entrypoint to hawk_scanner +ENTRYPOINT ["hawk_scanner"] \ No newline at end of file diff --git a/hawk_scanner/commands/couchdb.py b/hawk_scanner/commands/couchdb.py index 888634a..c0f723d 100644 --- a/hawk_scanner/commands/couchdb.py +++ b/hawk_scanner/commands/couchdb.py @@ -25,7 +25,7 @@ def check_data_patterns(db, patterns, profile_name, database_name): for field_name, field_value in document.items(): if field_value: value_str = str(field_value) - matches = system.match_strings(value_str) + matches = system.analyze_strings(value_str, 'couchdb') if matches: for match in matches: results.append({ @@ -42,36 +42,39 @@ def check_data_patterns(db, patterns, profile_name, database_name): return results -def execute(args): - results = [] - system.print_info(f"Running Checks for CouchDB Sources") - connections = system.get_connection() +def execute(args, programmatic=False): + try: + results = [] + system.print_info(f"Running Checks for CouchDB Sources") + connections = system.get_connection(args, programmatic) - if 'sources' in connections: - sources_config = connections['sources'] - couchdb_config = sources_config.get('couchdb') + if 'sources' in connections: + sources_config = connections['sources'] + couchdb_config = sources_config.get('couchdb') - if couchdb_config: - patterns = system.get_fingerprint_file() + if couchdb_config: + patterns = system.get_fingerprint_file(args, programmatic) - for key, config in couchdb_config.items(): - host = config.get('host') - port = config.get('port', 5984) # default CouchDB port - username = config.get('username') - password = config.get('password') - database = config.get('database') + for key, config in couchdb_config.items(): + host = config.get('host') + port = config.get('port', 5984) # default CouchDB port + username = config.get('username') + password = config.get('password') + database = config.get('database') - if host and username and password and database: - system.print_info(f"Checking CouchDB Profile {key} with host and authentication") - else: - system.print_error(f"Incomplete CouchDB configuration for key: {key}") - continue + if host and username and password and database: + system.print_info(f"Checking CouchDB Profile {key} with host and authentication") + else: + system.print_error(f"Incomplete CouchDB configuration for key: {key}") + continue - db = connect_couchdb(host, port, username, password, database) - if db: - results += check_data_patterns(db, patterns, key, database) + db = connect_couchdb(host, port, username, password, database) + if db: + results += check_data_patterns(db, patterns, key, database) + else: + system.print_error("No CouchDB connection details found in connection.yml") else: - system.print_error("No CouchDB connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error("No 'sources' section found in connection.yml") + except Exception as e: + system.print_error(f"Failed to execute CouchDB checks with error: {e}") return results diff --git a/hawk_scanner/commands/firebase.py b/hawk_scanner/commands/firebase.py index 64553c7..9774b25 100644 --- a/hawk_scanner/commands/firebase.py +++ b/hawk_scanner/commands/firebase.py @@ -15,75 +15,78 @@ def connect_firebase(credentials_file, bucket_name): except Exception as e: print(f"Failed to connect to Firebase bucket: {e}") -def execute(args): - results = [] - shouldDownload = True - connections = system.get_connection() +def execute(args, programmatic=False): + try: + results = [] + shouldDownload = True + connections = system.get_connection(args, programmatic) + fingerprints = system.get_fingerprint_file(args, programmatic) - if 'sources' in connections: - sources_config = connections['sources'] - firebase_config = sources_config.get('firebase') + if 'sources' in connections: + sources_config = connections['sources'] + firebase_config = sources_config.get('firebase') - if firebase_config: - for key, config in firebase_config.items(): - credentials_file = config.get('credentials_file') - bucket_name = config.get('bucket_name') - exclude_patterns = config.get(key, {}).get('exclude_patterns', []) + if firebase_config: + for key, config in firebase_config.items(): + credentials_file = config.get('credentials_file') + bucket_name = config.get('bucket_name') + exclude_patterns = config.get(key, {}).get('exclude_patterns', []) - if credentials_file and bucket_name: - bucket = connect_firebase(credentials_file, bucket_name) - if bucket: - for blob in bucket.list_blobs(): - file_name = blob.name - ## get unique etag or hash of file - remote_etag = blob.etag - system.print_debug(f"Remote etag: {remote_etag}") + if credentials_file and bucket_name: + bucket = connect_firebase(credentials_file, bucket_name) + if bucket: + for blob in bucket.list_blobs(): + file_name = blob.name + ## get unique etag or hash of file + remote_etag = blob.etag + system.print_debug(f"Remote etag: {remote_etag}") - if system.should_exclude_file(file_name, exclude_patterns): - continue + if system.should_exclude_file(file_name, exclude_patterns): + continue - file_path = f"data/firebase/{remote_etag}-{file_name}" - os.makedirs(os.path.dirname(file_path), exist_ok=True) + file_path = f"data/firebase/{remote_etag}-{file_name}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) - if config.get("cache") == True: - if os.path.exists(file_path): - shouldDownload = False - local_etag = file_path.split('/')[-1].split('-')[0] - system.print_debug(f"Local etag: {local_etag}") - system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") - if remote_etag != local_etag: - system.print_debug(f"File in firebase bucket has changed, downloading it again...") - shouldDownload = True - else: + if config.get("cache") == True: + if os.path.exists(file_path): shouldDownload = False + local_etag = file_path.split('/')[-1].split('-')[0] + system.print_debug(f"Local etag: {local_etag}") + system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") + if remote_etag != local_etag: + system.print_debug(f"File in firebase bucket has changed, downloading it again...") + shouldDownload = True + else: + shouldDownload = False - if shouldDownload: - file_path = f"data/firebase/{remote_etag}-{file_name}" - system.print_debug(f"Downloading file: {file_name} to {file_path}...") - blob.download_to_filename(file_path) - - matches = system.read_match_strings(file_path, 'google_cloud_storage') - if matches: - for match in matches: - results.append({ - 'bucket': bucket_name, - 'file_path': file_name, - 'pattern_name': match['pattern_name'], - 'matches': match['matches'], - 'sample_text': match['sample_text'], - 'profile': key, - 'data_source': 'firebase' - }) - + if shouldDownload: + file_path = f"data/firebase/{remote_etag}-{file_name}" + system.print_debug(f"Downloading file: {file_name} to {file_path}...") + blob.download_to_filename(file_path) + + matches = system.analyze_file(file_path, 'google_cloud_storage', connections, fingerprints, programmatic=programmatic) + if matches: + for match in matches: + results.append({ + 'bucket': bucket_name, + 'file_path': file_name, + 'pattern_name': match['pattern_name'], + 'matches': match['matches'], + 'sample_text': match['sample_text'], + 'profile': key, + 'data_source': 'firebase' + }) + if config.get("cache") == False: + os.system("rm -rf data/firebase") + else: + system.print_error(f"Failed to connect to Firebase bucket: {bucket_name}") else: - system.print_error(f"Failed to connect to Firebase bucket: {bucket_name}") - else: - system.print_error(f"Incomplete Firebase configuration for key: {key}") + system.print_error(f"Incomplete Firebase configuration for key: {key}") + else: + system.print_error("No Firebase connection details found in connection file") else: - system.print_error("No Firebase connection details found in connection file") - else: - system.print_error("No 'sources' section found in connection.yml") - - if config.get("cache") == False: - os.system("rm -rf data/firebase") + system.print_error("No 'sources' section found in connection.yml") + + except Exception as e: + print(f"Failed to connect to Firebase bucket: {e}") return results diff --git a/hawk_scanner/commands/fs.py b/hawk_scanner/commands/fs.py index 453c021..44e236d 100644 --- a/hawk_scanner/commands/fs.py +++ b/hawk_scanner/commands/fs.py @@ -1,14 +1,12 @@ -import argparse +import argparse, os, concurrent.futures, time from google.cloud import storage from rich.console import Console from hawk_scanner.internals import system -import os -import concurrent.futures -import time -def process_file(file_path, key, results): - matches = system.read_match_strings(file_path, 'fs') +def process_file(file_path, key, connections, fingerprints, programmatic=False): + matches = system.analyze_file(file_path, 'fs', connections, fingerprints, programmatic=programmatic) file_data = system.getFileData(file_path) + results = [] if matches: for match in matches: results.append({ @@ -21,43 +19,49 @@ def process_file(file_path, key, results): 'data_source': 'fs', 'file_data': file_data }) + return results -def execute(args): - results = [] - connections = system.get_connection() +def execute(args, programmatic=False): + try: + results = [] + connections = system.get_connection(args, programmatic) + fingerprints = system.get_fingerprint_file(args, programmatic) - if 'sources' in connections: - sources_config = connections['sources'] - fs_config = sources_config.get('fs') - if fs_config: - for key, config in fs_config.items(): - if 'path' not in config: - system.print_error(f"Path not found in fs profile '{key}'") - continue - path = config.get('path') - if not os.path.exists(path): - system.print_error(f"Path '{path}' does not exist") - - exclude_patterns = fs_config.get(key, {}).get('exclude_patterns', []) - start_time = time.time() - files = system.list_all_files_iteratively(path, exclude_patterns) - - # Use ThreadPoolExecutor for parallel processing - file_count = 0 - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [] - for file_path in files: - file_count += 1 - futures.append(executor.submit(process_file, file_path, key, results)) + if 'sources' in connections: + sources_config = connections['sources'] + fs_config = sources_config.get('fs') + if fs_config: + for key, config in fs_config.items(): + if 'path' not in config: + system.print_error(f"Path not found in fs profile '{key}'") + continue + path = config.get('path') + if not os.path.exists(path): + system.print_error(f"Path '{path}' does not exist") + + exclude_patterns = fs_config.get(key, {}).get('exclude_patterns', []) + start_time = time.time() + files = system.list_all_files_iteratively(path, exclude_patterns) - # Wait for all tasks to complete - concurrent.futures.wait(futures) - end_time = time.time() - system.print_info(f"Time taken to analyze {file_count} files: {end_time - start_time} seconds") + # Use ThreadPoolExecutor for parallel processing + file_count = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [] + for file_path in files: + file_count += 1 + results += process_file(file_path, key, connections, fingerprints, programmatic=programmatic) + + # Wait for all tasks to complete + concurrent.futures.wait(futures) + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + system.print_info(f"Time taken to analyze {file_count} files: {elapsed_time} seconds") + else: + system.print_error("No filesystem 'fs' connection details found in connection.yml") else: - system.print_error("No filesystem 'fs' connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error("No 'sources' section found in connection.yml") + except Exception as e: + system.print_error(f"Error in executing filesystem checks: {e}") return results if __name__ == "__main__": diff --git a/hawk_scanner/commands/gcs.py b/hawk_scanner/commands/gcs.py index c22adf0..058ce00 100644 --- a/hawk_scanner/commands/gcs.py +++ b/hawk_scanner/commands/gcs.py @@ -22,72 +22,76 @@ def get_last_update_time(blob): # Use Google Cloud Blob's etag as the entity tag (ETag) return blob.etag -def execute(args): - results = [] - shouldDownload = True - connections = system.get_connection() +def execute(args, programmatic=False): + try: + results = [] + shouldDownload = True + connections = system.get_connection(args, programmatic) + fingerprints = system.get_fingerprint_file(args, programmatic) - if 'sources' in connections: - sources_config = connections['sources'] - gcs_config = sources_config.get('gcs') + if 'sources' in connections: + sources_config = connections['sources'] + gcs_config = sources_config.get('gcs') - if gcs_config: - for key, config in gcs_config.items(): - bucket_name = config.get('bucket_name') - exclude_patterns = config.get(key, {}).get('exclude_patterns', []) - credentials_file = config.get('credentials_file') + if gcs_config: + for key, config in gcs_config.items(): + bucket_name = config.get('bucket_name') + exclude_patterns = config.get(key, {}).get('exclude_patterns', []) + credentials_file = config.get('credentials_file') - if bucket_name: - bucket = connect_google_cloud(bucket_name, credentials_file) - if bucket: - for blob in bucket.list_blobs(): - file_name = blob.name - ## get unique etag or hash of file - remote_etag = get_last_update_time(blob) - system.print_debug(f"Remote etag: {remote_etag}") + if bucket_name: + bucket = connect_google_cloud(bucket_name, credentials_file) + if bucket: + for blob in bucket.list_blobs(): + file_name = blob.name + ## get unique etag or hash of file + remote_etag = get_last_update_time(blob) + system.print_debug(f"Remote etag: {remote_etag}") - if system.should_exclude_file(file_name, exclude_patterns): - continue + if system.should_exclude_file(file_name, exclude_patterns): + continue - file_path = f"data/google_cloud_storage/{remote_etag}-{file_name}" - os.makedirs(os.path.dirname(file_path), exist_ok=True) + file_path = f"data/google_cloud_storage/{remote_etag}-{file_name}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) - if config.get("cache") == True: - if os.path.exists(file_path): - shouldDownload = False - local_etag = file_path.split('/')[-1].split('-')[0] - system.print_debug(f"Local etag: {local_etag}") - system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") - if remote_etag != local_etag: - system.print_debug(f"File in Google Cloud Storage bucket has changed, downloading it again...") - shouldDownload = True - else: + if config.get("cache") == True: + if os.path.exists(file_path): shouldDownload = False + local_etag = file_path.split('/')[-1].split('-')[0] + system.print_debug(f"Local etag: {local_etag}") + system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") + if remote_etag != local_etag: + system.print_debug(f"File in Google Cloud Storage bucket has changed, downloading it again...") + shouldDownload = True + else: + shouldDownload = False - if shouldDownload: - system.print_debug(f"Downloading file: {file_name} to {file_path}...") - blob.download_to_filename(file_path) + if shouldDownload: + system.print_debug(f"Downloading file: {file_name} to {file_path}...") + blob.download_to_filename(file_path) - matches = system.read_match_strings(file_path, 'google_cloud_storage') - if matches: - for match in matches: - results.append({ - 'bucket': bucket_name, - 'file_path': file_name, - 'pattern_name': match['pattern_name'], - 'matches': match['matches'], - 'sample_text': match['sample_text'], - 'profile': key, - 'data_source': 'gcs' - }) + matches = system.analyze_file(file_path, 'google_cloud_storage', connections, fingerprints, programmatic=programmatic) + if matches: + for match in matches: + results.append({ + 'bucket': bucket_name, + 'file_path': file_name, + 'pattern_name': match['pattern_name'], + 'matches': match['matches'], + 'sample_text': match['sample_text'], + 'profile': key, + 'data_source': 'gcs' + }) + else: + system.print_error(f"Failed to connect to Google Cloud Storage bucket: {bucket_name}") else: - system.print_error(f"Failed to connect to Google Cloud Storage bucket: {bucket_name}") - else: - system.print_error(f"Incomplete Google Cloud Storage configuration for key: {key}") + system.print_error(f"Incomplete Google Cloud Storage configuration for key: {key}") + else: + system.print_error("No Google Cloud Storage connection details found in connection.yml") else: - system.print_error("No Google Cloud Storage connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") - if config.get("cache") == False: - os.system("rm -rf data/google_cloud_storage") + system.print_error("No 'sources' section found in connection.yml") + if config.get("cache") == False: + os.system("rm -rf data/google_cloud_storage") + except Exception as e: + print(f"Failed to connect to Google Cloud Storage bucket: {e}") return results diff --git a/hawk_scanner/commands/gdrive.py b/hawk_scanner/commands/gdrive.py index c457d18..1603bd2 100644 --- a/hawk_scanner/commands/gdrive.py +++ b/hawk_scanner/commands/gdrive.py @@ -65,82 +65,86 @@ def list_files(drive, folder_name=None): print(f"Error listing files: {e}") return [] -def execute(args): - results = [] - should_download = True - connections = system.get_connection() - is_cache_enabled = False - drive_config = None - - if 'sources' in connections: - sources_config = connections['sources'] - drive_config = sources_config.get('gdrive') - else: - system.print_error("No 'sources' section found in connection.yml") - - if drive_config: - for key, config in drive_config.items(): - credentials_file = config.get('credentials_file') - folder_name = config.get('folder_name') - exclude_patterns = config.get(key, {}).get('exclude_patterns', []) - is_cache_enabled = config.get('cache', False) - drive = connect_google_drive(credentials_file) - if not os.path.exists("data/google_drive"): - os.makedirs("data/google_drive") - if drive: - files = list_files(drive, folder_name=folder_name) - for file_obj in files: - download_file(drive, file_obj, "data/google_drive") - file_id = file_obj['id'] - file_name = file_obj['title'] - if file_obj['mimeType'] == 'application/vnd.google-apps.folder': - continue - - # Construct file_path with the correct folder structure - parent_folder_ids = file_obj['parents'] - folder_path = "data/google_drive" - if parent_folder_ids: - for parent_id in parent_folder_ids: - parent_folder = drive.CreateFile({'id': parent_id['id']}) - if parent_folder['title'] == 'My Drive': - continue - folder_path = os.path.join(folder_path, parent_folder['title']) - - file_path = os.path.join(folder_path, file_name) - - if system.should_exclude_file(file_name, exclude_patterns): - continue - - if config.get("cache") and os.path.exists(file_path): - should_download = False - system.print_debug(f"File already exists in cache, using it.") - else: - should_download = True - - if should_download: - download_file(drive, file_obj, "data/google_drive") +def execute(args, programmatic=False): + try: + results = [] + should_download = True + connections = system.get_connection(args, programmatic) + fingerprints = system.get_fingerprint_file(args, programmatic) - matches = system.read_match_strings(file_path, 'gdrive') - if matches: - for match in matches: - results.append({ - 'file_id': file_id, - 'file_name': file_name, - 'file_path': file_path, - 'pattern_name': match['pattern_name'], - 'matches': match['matches'], - 'sample_text': match['sample_text'], - 'profile': key, - 'data_source': 'gdrive' - }) - else: - system.print_error("Failed to connect to Google Drive") - else: - system.print_error("No Google Drive connection details found in connection file") - - if not is_cache_enabled: - os.system("rm -rf data/google_drive") + is_cache_enabled = False + drive_config = None + + if 'sources' in connections: + sources_config = connections['sources'] + drive_config = sources_config.get('gdrive') + else: + system.print_error("No 'sources' section found in connection.yml") + + if drive_config: + for key, config in drive_config.items(): + credentials_file = config.get('credentials_file') + folder_name = config.get('folder_name') + exclude_patterns = config.get(key, {}).get('exclude_patterns', []) + is_cache_enabled = config.get('cache', False) + drive = connect_google_drive(credentials_file) + if not os.path.exists("data/google_drive"): + os.makedirs("data/google_drive") + if drive: + files = list_files(drive, folder_name=folder_name) + for file_obj in files: + download_file(drive, file_obj, "data/google_drive") + file_id = file_obj['id'] + file_name = file_obj['title'] + if file_obj['mimeType'] == 'application/vnd.google-apps.folder': + continue + + # Construct file_path with the correct folder structure + parent_folder_ids = file_obj['parents'] + folder_path = "data/google_drive" + if parent_folder_ids: + for parent_id in parent_folder_ids: + parent_folder = drive.CreateFile({'id': parent_id['id']}) + if parent_folder['title'] == 'My Drive': + continue + folder_path = os.path.join(folder_path, parent_folder['title']) + + file_path = os.path.join(folder_path, file_name) + + if system.should_exclude_file(file_name, exclude_patterns): + continue + + if config.get("cache") and os.path.exists(file_path): + should_download = False + system.print_debug(f"File already exists in cache, using it.") + else: + should_download = True + + if should_download: + download_file(drive, file_obj, "data/google_drive") + + matches = system.analyze_file(file_path, 'gdrive', connections, fingerprints, programmatic=programmatic) + if matches: + for match in matches: + results.append({ + 'file_id': file_id, + 'file_name': file_name, + 'file_path': file_path, + 'pattern_name': match['pattern_name'], + 'matches': match['matches'], + 'sample_text': match['sample_text'], + 'profile': key, + 'data_source': 'gdrive' + }) + else: + system.print_error("Failed to connect to Google Drive") + else: + system.print_error("No Google Drive connection details found in connection file") + if not is_cache_enabled: + os.system("rm -rf data/google_drive") + except Exception as e: + print(f"Failed to connect to Google Drive: {e}") return results # Call the execute function with the necessary arguments diff --git a/hawk_scanner/commands/gdrive_workspace.py b/hawk_scanner/commands/gdrive_workspace.py index 364782a..3e94f41 100644 --- a/hawk_scanner/commands/gdrive_workspace.py +++ b/hawk_scanner/commands/gdrive_workspace.py @@ -81,76 +81,80 @@ def list_files(drive, impersonate_user=None): print(f"Error listing files: {e}") return [] -def execute(args): - results = [] - connections = system.get_connection() - is_cache_enabled = False - - if 'sources' in connections: - sources_config = connections['sources'] - drive_config = sources_config.get('gdrive_workspace') - else: - system.print_error("No 'sources' section found in connection.yml") - - if drive_config: - for key, config in drive_config.items(): - credentials_file = config.get('credentials_file') - impersonate_users = config.get('impersonate_users', []) - exclude_patterns = config.get(key, {}).get('exclude_patterns', []) - is_cache_enabled = config.get('cache', False) - - for impersonate_user in impersonate_users or [None]: - drive = connect_google_drive(credentials_file, impersonate_user) - if not os.path.exists("data/google_drive"): - os.makedirs("data/google_drive") - if drive: - files = list_files(drive, impersonate_user) - for file_obj in files: - - if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.document' or file_obj['mimeType'] == 'application/vnd.google-apps.spreadsheet' or file_obj['mimeType'] == 'application/vnd.google-apps.presentation' or file_obj['mimeType'] == 'application/vnd.google-apps.drawing' or file_obj['mimeType'] == 'application/vnd.google-apps.script': - file_obj['name'] = file_obj['name'] + '-runtime.pdf' - - file_id = file_obj['id'] - file_name = file_obj['name'] - folder_path = "data/google_drive" - - file_path = os.path.join(folder_path, file_name) - - if system.should_exclude_file(file_name, exclude_patterns): - continue - - if config.get("cache") and os.path.exists(file_path): - is_cache_enabled = False - system.print_debug(f"File already exists in cache, using it.") - else: - is_cache_enabled = True - - if is_cache_enabled: - download_file(drive, file_obj, "data/google_drive/") - - matches = system.read_match_strings(file_path, 'gdrive_workspace') - file_name = file_name.replace('-runtime.pdf', '') - if matches: - for match in matches: - results.append({ - 'file_id': file_id, - 'file_name': file_name, - 'user': impersonate_user, - 'file_path': file_path, - 'pattern_name': match['pattern_name'], - 'matches': match['matches'], - 'sample_text': match['sample_text'], - 'profile': key, - 'data_source': 'gdrive_workspace' - }) - else: - system.print_error("Failed to connect to Google Drive") - else: - system.print_error("No Google Drive connection details found in connection file") +def execute(args, programmatic=False): + try: + results = [] + connections = system.get_connection(args, programmatic) + fingerprints = system.get_fingerprint_file(args, programmatic) + + is_cache_enabled = False - """if not is_cache_enabled: - os.system("rm -rf data/google_drive")""" + if 'sources' in connections: + sources_config = connections['sources'] + drive_config = sources_config.get('gdrive_workspace') + else: + system.print_error("No 'sources' section found in connection.yml") + + if drive_config: + for key, config in drive_config.items(): + credentials_file = config.get('credentials_file') + impersonate_users = config.get('impersonate_users', []) + exclude_patterns = config.get(key, {}).get('exclude_patterns', []) + is_cache_enabled = config.get('cache', False) + + for impersonate_user in impersonate_users or [None]: + drive = connect_google_drive(credentials_file, impersonate_user) + if not os.path.exists("data/google_drive"): + os.makedirs("data/google_drive") + if drive: + files = list_files(drive, impersonate_user) + for file_obj in files: + + if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.document' or file_obj['mimeType'] == 'application/vnd.google-apps.spreadsheet' or file_obj['mimeType'] == 'application/vnd.google-apps.presentation' or file_obj['mimeType'] == 'application/vnd.google-apps.drawing' or file_obj['mimeType'] == 'application/vnd.google-apps.script': + file_obj['name'] = file_obj['name'] + '-runtime.pdf' + + file_id = file_obj['id'] + file_name = file_obj['name'] + folder_path = "data/google_drive" + + file_path = os.path.join(folder_path, file_name) + + if system.should_exclude_file(file_name, exclude_patterns): + continue + + if config.get("cache") and os.path.exists(file_path): + is_cache_enabled = False + system.print_debug(f"File already exists in cache, using it.") + else: + is_cache_enabled = True + + if is_cache_enabled: + download_file(drive, file_obj, "data/google_drive/") + + matches = system.analyze_file(file_path, 'gdrive_workspace', connections, fingerprints, programmatic=programmatic) + file_name = file_name.replace('-runtime.pdf', '') + if matches: + for match in matches: + results.append({ + 'file_id': file_id, + 'file_name': file_name, + 'user': impersonate_user, + 'file_path': file_path, + 'pattern_name': match['pattern_name'], + 'matches': match['matches'], + 'sample_text': match['sample_text'], + 'profile': key, + 'data_source': 'gdrive_workspace' + }) + else: + system.print_error("Failed to connect to Google Drive") + else: + system.print_error("No Google Drive connection details found in connection file") + """if not is_cache_enabled: + os.system("rm -rf data/google_drive")""" + except Exception as e: + print(f"Error executing gdrive_workspace module: {e}") return results # Call the execute function with the necessary arguments diff --git a/hawk_scanner/commands/mongodb.py b/hawk_scanner/commands/mongodb.py index 7ce6359..fd7fcf6 100644 --- a/hawk_scanner/commands/mongodb.py +++ b/hawk_scanner/commands/mongodb.py @@ -42,7 +42,7 @@ def check_data_patterns(db, patterns, profile_name, database_name, limit_start=0 for field_name, field_value in document.items(): if field_value: value_str = str(field_value) - matches = system.match_strings(value_str) + matches = system.analyze_strings(value_str) if matches: for match in matches: results.append({ @@ -59,44 +59,47 @@ def check_data_patterns(db, patterns, profile_name, database_name, limit_start=0 return results -def execute(args): - results = [] - system.print_info(f"Running Checks for MongoDB Sources") - connections = system.get_connection() - - if 'sources' in connections: - sources_config = connections['sources'] - mongodb_config = sources_config.get('mongodb') - - if mongodb_config: - patterns = system.get_fingerprint_file() - - for key, config in mongodb_config.items(): - host = config.get('host') - port = config.get('port', 27017) # default MongoDB port - username = config.get('username') - password = config.get('password') - database = config.get('database') - uri = config.get('uri') # Added support for URI - limit_start = config.get('limit_start', 0) - limit_end = config.get('limit_end', 500) - collections = config.get('collections', []) - - if uri: - system.print_info(f"Checking MongoDB Profile {key} using URI") - elif host and username and password and database: - system.print_info(f"Checking MongoDB Profile {key} with host and authentication") - else: - system.print_error(f"Incomplete MongoDB configuration for key: {key}") - continue - - db = connect_mongodb(host, port, username, password, database, uri) - if db: - results += check_data_patterns(db, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_collections=collections) +def execute(args, programmatic=False): + try: + results = [] + system.print_info(f"Running Checks for MongoDB Sources") + connections = system.get_connection(args, programmatic) + + if 'sources' in connections: + sources_config = connections['sources'] + mongodb_config = sources_config.get('mongodb') + + if mongodb_config: + patterns = system.get_fingerprint_file(args, programmatic) + + for key, config in mongodb_config.items(): + host = config.get('host') + port = config.get('port', 27017) # default MongoDB port + username = config.get('username') + password = config.get('password') + database = config.get('database') + uri = config.get('uri') # Added support for URI + limit_start = config.get('limit_start', 0) + limit_end = config.get('limit_end', 500) + collections = config.get('collections', []) + + if uri: + system.print_info(f"Checking MongoDB Profile {key} using URI") + elif host and username and password and database: + system.print_info(f"Checking MongoDB Profile {key} with host and authentication") + else: + system.print_error(f"Incomplete MongoDB configuration for key: {key}") + continue + + db = connect_mongodb(host, port, username, password, database, uri) + if db: + results += check_data_patterns(db, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_collections=collections) + else: + system.print_error("No MongoDB connection details found in connection.yml") else: - system.print_error("No MongoDB connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error("No 'sources' section found in connection.yml") + except Exception as e: + system.print_error(f"Failed to run MongoDB checks with error: {e}") return results # Example usage diff --git a/hawk_scanner/commands/mysql.py b/hawk_scanner/commands/mysql.py index 2747dc8..2eb9e36 100644 --- a/hawk_scanner/commands/mysql.py +++ b/hawk_scanner/commands/mysql.py @@ -64,40 +64,43 @@ def check_data_patterns(conn, patterns, profile_name, database_name, limit_start cursor.close() return results -def execute(args): - results = [] - system.print_info(f"Running Checks for MySQL Sources") - connections = system.get_connection() +def execute(args, programmatic=False): + try: + results = [] + system.print_info(f"Running Checks for MySQL Sources") + connections = system.get_connection(args, programmatic) - if 'sources' in connections: - sources_config = connections['sources'] - mysql_config = sources_config.get('mysql') + if 'sources' in connections: + sources_config = connections['sources'] + mysql_config = sources_config.get('mysql') - if mysql_config: - patterns = system.get_fingerprint_file() + if mysql_config: + patterns = system.get_fingerprint_file(args, programmatic) - for key, config in mysql_config.items(): - host = config.get('host') - user = config.get('user') - port = config.get('port', 3306) # default port for MySQL - password = config.get('password') - database = config.get('database') - limit_start = config.get('limit_start', 0) - limit_end = config.get('limit_end', 500) - tables = config.get('tables', []) + for key, config in mysql_config.items(): + host = config.get('host') + user = config.get('user') + port = config.get('port', 3306) # default port for MySQL + password = config.get('password') + database = config.get('database') + limit_start = config.get('limit_start', 0) + limit_end = config.get('limit_end', 500) + tables = config.get('tables', []) - if host and user and database: - system.print_info(f"Checking MySQL Profile {key} and database {database}") - conn = connect_mysql(host, port, user, password, database) - if conn: - results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables) - conn.close() - else: - system.print_error(f"Incomplete MySQL configuration for key: {key}") + if host and user and database: + system.print_info(f"Checking MySQL Profile {key} and database {database}") + conn = connect_mysql(host, port, user, password, database) + if conn: + results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables) + conn.close() + else: + system.print_error(f"Incomplete MySQL configuration for key: {key}") + else: + system.print_error("No MySQL connection details found in connection.yml") else: - system.print_error("No MySQL connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error("No 'sources' section found in connection.yml") + except Exception as e: + system.print_error(f"Failed to run MySQL checks with error: {e}") return results # Example usage diff --git a/hawk_scanner/commands/postgresql.py b/hawk_scanner/commands/postgresql.py index 282dc0e..17d837a 100644 --- a/hawk_scanner/commands/postgresql.py +++ b/hawk_scanner/commands/postgresql.py @@ -46,7 +46,7 @@ def check_data_patterns(conn, patterns, profile_name, database_name, limit_start for column, value in zip(columns, row): if value: value_str = str(value) - matches = system.match_strings(value_str) + matches = system.analyze_strings(value_str) if matches: for match in matches: results.append({ @@ -68,40 +68,43 @@ def check_data_patterns(conn, patterns, profile_name, database_name, limit_start cursor.close() return results -def execute(args): - results = [] - system.print_info(f"Running Checks for PostgreSQL Sources") - connections = system.get_connection() - - if 'sources' in connections: - sources_config = connections['sources'] - postgresql_config = sources_config.get('postgresql') - - if postgresql_config: - patterns = system.get_fingerprint_file() - - for key, config in postgresql_config.items(): - host = config.get('host') - user = config.get('user') - port = config.get('port', 5432) # default port for PostgreSQL - password = config.get('password') - database = config.get('database') - limit_start = config.get('limit_start', 0) - limit_end = config.get('limit_end', 500) - tables = config.get('tables', []) - - if host and user and password and database: - system.print_info(f"Checking PostgreSQL Profile {key}, database {database}") - conn = connect_postgresql(host, port, user, password, database) - if conn: - results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables) - conn.close() - else: - system.print_error(f"Incomplete PostgreSQL configuration for key: {key}") +def execute(args, programmatic=False): + try: + results = [] + system.print_info(f"Running Checks for PostgreSQL Sources") + connections = system.get_connection(args, programmatic) + + if 'sources' in connections: + sources_config = connections['sources'] + postgresql_config = sources_config.get('postgresql') + + if postgresql_config: + patterns = system.get_fingerprint_file(args, programmatic) + + for key, config in postgresql_config.items(): + host = config.get('host') + user = config.get('user') + port = config.get('port', 5432) # default port for PostgreSQL + password = config.get('password') + database = config.get('database') + limit_start = config.get('limit_start', 0) + limit_end = config.get('limit_end', 500) + tables = config.get('tables', []) + + if host and user and password and database: + system.print_info(f"Checking PostgreSQL Profile {key}, database {database}") + conn = connect_postgresql(host, port, user, password, database) + if conn: + results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables) + conn.close() + else: + system.print_error(f"Incomplete PostgreSQL configuration for key: {key}") + else: + system.print_error("No PostgreSQL connection details found in connection.yml") else: - system.print_error("No PostgreSQL connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error("No 'sources' section found in connection.yml") + except Exception as e: + system.print_error(f"Failed to run PostgreSQL checks with error: {e}") return results # Example usage diff --git a/hawk_scanner/commands/redis.py b/hawk_scanner/commands/redis.py index f153c68..af66816 100644 --- a/hawk_scanner/commands/redis.py +++ b/hawk_scanner/commands/redis.py @@ -44,30 +44,33 @@ def check_data_patterns(redis_instance, patterns, profile_name, host): }) return results -def execute(args): - results = [] - connections = system.get_connection() +def execute(args, programmatic=False): + try: + results = [] + connections = system.get_connection(args, programmatic) - if 'sources' in connections: - sources_config = connections['sources'] - redis_config = sources_config.get('redis') + if 'sources' in connections: + sources_config = connections['sources'] + redis_config = sources_config.get('redis') - if redis_config: - patterns = system.get_fingerprint_file() + if redis_config: + patterns = system.get_fingerprint_file(args, programmatic) - for profile_name, config in redis_config.items(): - host = config.get('host') - port = config.get('port', 6379) + for profile_name, config in redis_config.items(): + host = config.get('host') + port = config.get('port', 6379) - if host: - redis_instance = connect_redis(host, port) - if redis_instance: - results = check_data_patterns(redis_instance, patterns, profile_name, host) - redis_instance.close() - else: - system.print_error(f"Incomplete Redis configuration for key: {profile_name}") + if host: + redis_instance = connect_redis(host, port) + if redis_instance: + results = check_data_patterns(redis_instance, patterns, profile_name, host) + redis_instance.close() + else: + system.print_error(f"Incomplete Redis configuration for key: {profile_name}") + else: + system.print_error("No Redis connection details found in connection.yml") else: - system.print_error("No Redis connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error("No 'sources' section found in connection.yml") + except Exception as e: + system.print_error(f"Error: {e}") return results diff --git a/hawk_scanner/commands/s3.py b/hawk_scanner/commands/s3.py index bae88a2..85caf72 100644 --- a/hawk_scanner/commands/s3.py +++ b/hawk_scanner/commands/s3.py @@ -31,76 +31,78 @@ def get_patterns_from_file(file_path): patterns = yaml.safe_load(file) return patterns -def execute(args): - results = [] - shouldDownload = True - system.print_info(f"Running Checks for S3 Sources") - connections = system.get_connection() - - if 'sources' in connections: - sources_config = connections['sources'] - s3_config = sources_config.get('s3') +def execute(args, programmatic=False): + try: + results = [] + shouldDownload = True + system.print_info(f"Running Checks for S3 Sources") + connections = system.get_connection(args, programmatic) + if 'sources' in connections: + sources_config = connections['sources'] + s3_config = sources_config.get('s3') - if s3_config: - for key, config in s3_config.items(): - access_key = config.get('access_key') - secret_key = config.get('secret_key') - bucket_name = config.get('bucket_name') - exclude_patterns = config.get(key, {}).get('exclude_patterns', []) + if s3_config: + for key, config in s3_config.items(): + access_key = config.get('access_key') + secret_key = config.get('secret_key') + bucket_name = config.get('bucket_name') + exclude_patterns = config.get(key, {}).get('exclude_patterns', []) - system.print_info(f"Checking S3 profile: '{key}' with bucket '{bucket_name}'") - profile_name = key - if access_key and secret_key and bucket_name: - bucket = connect_s3(access_key, secret_key, bucket_name) - if bucket: + system.print_info(f"Checking S3 profile: '{key}' with bucket '{bucket_name}'") + profile_name = key + if access_key and secret_key and bucket_name: + bucket = connect_s3(access_key, secret_key, bucket_name) + if bucket: - for obj in bucket.objects.all(): - remote_etag = obj.e_tag.replace('"', '') - system.print_debug(f"Remote etag: {remote_etag}") - file_name = obj.key - if system.should_exclude_file(file_name, exclude_patterns): - continue + for obj in bucket.objects.all(): + remote_etag = obj.e_tag.replace('"', '') + system.print_debug(f"Remote etag: {remote_etag}") + file_name = obj.key + if system.should_exclude_file(file_name, exclude_patterns): + continue - file_path = f"data/s3/{remote_etag}-{file_name}" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - if config.get("cache") == True: - if os.path.exists(file_path): - shouldDownload = False - local_etag = file_path.split('/')[-1].split('-')[0] - system.print_debug(f"Local etag: {local_etag}") - system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") - if remote_etag != local_etag: - system.print_debug(f"File in S3 bucket has changed, downloading it again...") - shouldDownload = True - else: + file_path = f"data/s3/{remote_etag}-{file_name}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) + if config.get("cache") == True: + if os.path.exists(file_path): shouldDownload = False + local_etag = file_path.split('/')[-1].split('-')[0] + system.print_debug(f"Local etag: {local_etag}") + system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") + if remote_etag != local_etag: + system.print_debug(f"File in S3 bucket has changed, downloading it again...") + shouldDownload = True + else: + shouldDownload = False - if shouldDownload: - file_path = f"data/s3/{remote_etag}-{file_name}" - system.print_debug(f"Downloading file: {file_name} to {file_path}...") - bucket.download_file(file_name, file_path) - - matches = system.read_match_strings(file_path, 'google_cloud_storage') - if matches: - for match in matches: - results.append({ - 'bucket': bucket_name, - 'file_path': file_name, - 'pattern_name': match['pattern_name'], - 'matches': match['matches'], - 'sample_text': match['sample_text'], - 'profile': key, - 'data_source': 's3' - }) + if shouldDownload: + file_path = f"data/s3/{remote_etag}-{file_name}" + system.print_debug(f"Downloading file: {file_name} to {file_path}...") + bucket.download_file(file_name, file_path) + + matches = system.analyze_strings(file_path, 'google_cloud_storage') + if matches: + for match in matches: + results.append({ + 'bucket': bucket_name, + 'file_path': file_name, + 'pattern_name': match['pattern_name'], + 'matches': match['matches'], + 'sample_text': match['sample_text'], + 'profile': key, + 'data_source': 's3' + }) + else: + system.print_error(f"Failed to connect to S3 bucket: {bucket_name}") else: - system.print_error(f"Failed to connect to S3 bucket: {bucket_name}") - else: - system.print_error(f"Incomplete S3 configuration for key: {key}") + system.print_error(f"Incomplete S3 configuration for key: {key}") + else: + system.print_error("No S3 connection details found in connection.yml") else: - system.print_error("No S3 connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") - if config.get("cache") == False: - os.system("rm -rf data/s3") + system.print_error("No 'sources' section found in connection.yml") + if config.get("cache") == False: + os.system("rm -rf data/s3") + except Exception as e: + system.print_error(f"Error running S3 checks: {e}") return results diff --git a/hawk_scanner/commands/slack.py b/hawk_scanner/commands/slack.py index a2e09e8..a12cc2c 100644 --- a/hawk_scanner/commands/slack.py +++ b/hawk_scanner/commands/slack.py @@ -67,35 +67,37 @@ def check_slack_messages(client, patterns, profile_name, channel_types, channel_ system.print_error(f"Failed to fetch messages from Slack with error: {e.response['error']}") return results -def execute(args): - results = [] - system.print_info("Running Checks for Slack Sources") - connections = system.get_connection() +def execute(args, programmatic=False): + try: + results = [] + system.print_info("Running Checks for Slack Sources") + connections = system.get_connection(args, programmatic) - if 'sources' in connections: - sources_config = connections['sources'] - slack_config = sources_config.get('slack') + if 'sources' in connections: + sources_config = connections['sources'] + slack_config = sources_config.get('slack') - if slack_config: - patterns = system.get_fingerprint_file() + if slack_config: + patterns = system.get_fingerprint_file(args, programmatic) - for key, config in slack_config.items(): - token = config.get('token') - channel_types = config.get('channel_types', "public_channel,private_channel") - channel_names = config.get('channel_names', None) + for key, config in slack_config.items(): + token = config.get('token') + channel_types = config.get('channel_types', "public_channel,private_channel") + channel_names = config.get('channel_names', None) - if token: - system.print_info(f"Checking Slack Profile {key}") - else: - system.print_error(f"Incomplete Slack configuration for key: {key}") - continue + if token: + system.print_info(f"Checking Slack Profile {key}") + else: + system.print_error(f"Incomplete Slack configuration for key: {key}") + continue - client = connect_slack(token) - if client: - results += check_slack_messages(client, patterns, key, channel_types, channel_names) + client = connect_slack(token) + if client: + results += check_slack_messages(client, patterns, key, channel_types, channel_names) + else: + system.print_error("No Slack connection details found in connection.yml") else: - system.print_error("No Slack connection details found in connection.yml") - else: - system.print_error("No 'sources' section found in connection.yml") - + system.print_error("No 'sources' section found in connection.yml") + except Exception as e: + system.print_error(f"Failed to run Slack checks with error: {e}") return results \ No newline at end of file diff --git a/hawk_scanner/commands/text.py b/hawk_scanner/commands/text.py index 8781c83..cdd6ec8 100644 --- a/hawk_scanner/commands/text.py +++ b/hawk_scanner/commands/text.py @@ -5,7 +5,7 @@ def check_data_patterns(value, patterns, profile_name): value_str = str(value) - matches = system.match_strings(value_str) + matches = system.analyze_strings(value_str) results = [] if matches: for match in matches: @@ -19,16 +19,14 @@ def check_data_patterns(value, patterns, profile_name): return results def execute(args, programmatic=False): - results = [] - system.print_info(f"Running Checks for Simple text") - connections = system.get_connection() - patterns = system.get_fingerprint_file() - - if not programmatic: + try: + results = [] + system.print_info(f"Running Checks for Simple text") + connections = system.get_connection(args, programmatic) + patterns = system.get_fingerprint_file(args, programmatic) if 'sources' in connections: sources_config = connections['sources'] text_config = sources_config.get('text') - if text_config: for key, config in text_config.items(): text = config.get('text', None) @@ -37,9 +35,8 @@ def execute(args, programmatic=False): system.print_error("No text connection details found in connection.yml") else: system.print_error("No 'sources' section found in connection.yml") - else: - text = args.get('text', None) - results += check_data_patterns(text, patterns, 'text') + except Exception as e: + system.print_error(f"Failed to connect to text with error: {e}") return results # Example usage diff --git a/hawk_scanner/internals/system.py b/hawk_scanner/internals/system.py index 0b9ab15..b203f7b 100644 --- a/hawk_scanner/internals/system.py +++ b/hawk_scanner/internals/system.py @@ -15,18 +15,16 @@ # Create a TinyDB instance for storing previous alert hashes db = TinyDB('previous_alerts.json') +console = Console() parser = argparse.ArgumentParser(description='🦅 A powerful scanner to scan your Filesystem, S3, MySQL, PostgreSQL, MongoDB, Redis, Google Cloud Storage and Firebase storage for PII and sensitive data.') parser.add_argument('--connection', action='store', help='YAML Connection file path') parser.add_argument('--fingerprint', action='store', help='Override YAML fingerprint file path') parser.add_argument('--debug', action='store_true', help='Enable debug mode') +parser.add_argument('--programmatic', action='store_true', help='Enable programmatic mode') parser.add_argument('--shutup', action='store_true', help='Suppress the Hawk Eye banner 🫣', default=False) -args, extra_args = parser.parse_known_args() - - -console = Console() - +args, extra = parser.parse_known_args() def calculate_msg_hash(msg): return hashlib.sha256(msg.encode()).hexdigest() @@ -106,20 +104,31 @@ def RedactData(input_string): return redacted_string -def get_connection(): - if args.connection: - if os.path.exists(args.connection): - with open(args.connection, 'r') as file: - connections = yaml.safe_load(file) - return connections +def get_connection(args=None, programmatic=False): + try: + if args.connection and not programmatic: + print("ok 1") + if os.path.exists(args.connection): + print("ok 2") + with open(args.connection, 'r') as file: + connections = yaml.safe_load(file) + return connections + else: + print_error(f"Connection file not found: {args.connection}") + exit(1) + elif programmatic and args.connection: + return args.connection else: - print_error(f"Connection file not found: {args.connection}") + print_error(f"Please provide a connection file using --connection flag") exit(1) - else: - print_error(f"Please provide a connection file using --connection flag") + except Exception as e: + print_error(f"Unable to load connection file: {e}") exit(1) -def get_fingerprint_file(): +def get_fingerprint_file(args, programmatic=False): + print_info(f"Programmatic: {programmatic}") + if args.programmatic: + programmatic = True if args.fingerprint: if os.path.exists(args.fingerprint): with open(args.fingerprint, 'r') as file: @@ -127,8 +136,8 @@ def get_fingerprint_file(): else: print_error(f"Fingerprint file not found: {args.fingerprint}") exit(1) - else: - file_path = "https://github.com/rohitcoder/hawk-eye/raw/main/fingerprint.yml" + elif not programmatic: + file_path = "https://github.com/rohitcoderdss/hawk-eye/raw/main/fingerprint.yml" try: response = requests.get(file_path, timeout=10) print_info(f"Downloading default fingerprint.yml from {file_path}") @@ -140,10 +149,18 @@ def get_fingerprint_file(): print_error(f"Unable to download default fingerprint.yml please provide your own fingerprint file using --fingerprint flag") exit(1) except Exception as e: - print_error(f"Unable to download default fingerprint.yml please provide your own fingerprint file using --fingerprint flag") + print_error(f"Unable to download default fingerprint.yml please provide your own fingerprint file using --fingerprint flag: "+str(e)) + exit(1) + else: + print_debug(f"Using fingerprint file: {args.fingerprint} for programmatic execution") + if os.path.exists(args.fingerprint): + with open(args.fingerprint, 'r') as file: + return yaml.safe_load(file) + else: + print_error(f"Fingerprint file not found: {args.fingerprint} for programmatic execution") exit(1) -patterns = get_fingerprint_file() +patterns = get_fingerprint_file(args) def print_banner(): banner = r""" @@ -183,11 +200,12 @@ def print_banner(): if not args.shutup: console.print(banner) -connections = get_connection() +connections = get_connection(args) +patterns = get_fingerprint_file(args) -def match_strings(content): +def analyze_strings(content, connections=connections, patterns=patterns, programmatic=False): matched_strings = [] - + print_debug(f"Connections: {connections}") if 'notify' in connections: redacted: bool = connections.get('notify', {}).get('redacted', False) else: @@ -199,7 +217,6 @@ def match_strings(content): ## parse pattern_regex as Regex complied_regex = re.compile(pattern_regex, re.IGNORECASE) print_debug(f"Regex: {complied_regex}") - print_debug(f"Content: {content}") matches = re.findall(complied_regex, content) print_debug(f"Matches: {matches}") if matches: @@ -250,18 +267,12 @@ def list_all_files_iteratively(path, exclude_patterns): if not should_exclude_file(file, exclude_patterns): yield os.path.join(root, file) -def read_match_strings(file_path, source): +def analyze_file(file_path, source, connections=None, patterns=None, programmatic=False): print_info(f"Scanning file: {file_path}") content = '' - try: - # Check if the file is an image - print(file_path) if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')): - print("ocr started for "+file_path) content = enhance_and_ocr(file_path) - print("texts") - print(content) # Check if the file is a PDF document elif file_path.lower().endswith('.pdf'): content = read_pdf(file_path) @@ -277,10 +288,10 @@ def read_match_strings(file_path, source): # Attempt to decode using UTF-8, fallback to 'latin-1' if needed content = file.read().decode('utf-8', errors='replace') except Exception as e: - print_debug(f"Error in read_match_strings: {e}") + print_debug(f"Error in analyze_file: {e}") pass - - matched_strings = match_strings(content) + matched_strings = analyze_strings(content, connections, patterns, programmatic=programmatic) + print_debug(f"Matched strings: {matched_strings}") return matched_strings def read_pdf(file_path): @@ -347,7 +358,7 @@ def read_archive(file_path): for root, dirs, files in os.walk(tmp_dir): for file in files: file_path = os.path.join(root, file) - content += read_match_strings(file_path, 'archive') # Recursively read content + content += analyze_file(file_path, 'archive') # Recursively read content # Clean up the temporary directory shutil.rmtree(tmp_dir) diff --git a/hawk_scanner/main.py b/hawk_scanner/main.py index 029bcfc..ba1adc4 100644 --- a/hawk_scanner/main.py +++ b/hawk_scanner/main.py @@ -22,6 +22,16 @@ def clear_screen(): ## Now separate the results by data_source data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb', 'gdrive', 'gdrive_workspace', 'text'] +data_sources_option = ['all'] + data_sources +parser = argparse.ArgumentParser(description='🦅 A powerful scanner to scan your Filesystem, S3, MySQL, PostgreSQL, MongoDB, Redis, Google Cloud Storage and Firebase storage for PII and sensitive data.') +parser.add_argument('command', nargs='?', choices=data_sources_option, help='Command to execute') +parser.add_argument('--json', help='Save output to json file') +parser.add_argument('--debug', action='store_true', help='Enable debug mode') +parser.add_argument('--connection', action='store', help='YAML Connection file path') +parser.add_argument('--fingerprint', action='store', help='Override YAML fingerprint file path') +parser.add_argument('--shutup', action='store_true', help='Suppress the Hawk Eye banner 🫣', default=False) + +args = parser.parse_args() def load_command_module(command): try: @@ -42,22 +52,15 @@ def execute_command(command, args): def main(): - data_sources_option = ['all'] + data_sources - parser = argparse.ArgumentParser(description='CLI Command Executor') - parser.add_argument('command', nargs='?', choices=data_sources_option, help='Command to execute') - parser.add_argument('--json', help='Save output to json file') - parser.add_argument('--debug', action='store_true', help='Enable debug mode') - - args, extra_args = parser.parse_known_args() results = [] if args.command: if args.command == 'all': commands = data_sources for command in commands: - for data in execute_command(command, extra_args): + for data in execute_command(command, args): results.append(data) else: - for data in execute_command(args.command, extra_args): + for data in execute_command(args.command, args): results.append(data) else: system.print_error("Please provide a command to execute") diff --git a/setup.py b/setup.py index 3e9c387..b6c9f17 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -VERSION = "0.3.8" +VERSION = "0.3.9" from setuptools import setup, find_packages