From 96b42b9ece61ce236751faf8fcea2c733ca363fd Mon Sep 17 00:00:00 2001
From: Rohit kumar <rohittourister@gmail.com>
Date: Wed, 6 Dec 2023 14:53:01 +0530
Subject: [PATCH] Added google drive module

---
 client_secrets.json                 |   1 +
 connection.yml.sample               |   9 ++
 hawk_scanner/commands/firebase.py   |   2 +-
 hawk_scanner/commands/gdrive.py     | 142 ++++++++++++++++++++++++++++
 hawk_scanner/commands/mongodb.py    |  22 ++++-
 hawk_scanner/commands/mysql.py      |  11 ++-
 hawk_scanner/commands/postgresql.py |   9 +-
 hawk_scanner/commands/redis.py      |   2 +-
 hawk_scanner/commands/s3.py         |   2 +-
 hawk_scanner/internals/system.py    |  10 +-
 hawk_scanner/main.py                |  30 +++++-
 readme.md                           |  17 +++-
 requirements.txt                    |   8 +-
 13 files changed, 245 insertions(+), 20 deletions(-)
 create mode 100644 client_secrets.json
 create mode 100644 hawk_scanner/commands/gdrive.py

diff --git a/client_secrets.json b/client_secrets.json
new file mode 100644
index 0000000..68909a4
--- /dev/null
+++ b/client_secrets.json
@@ -0,0 +1 @@
+{"installed":{"access_token": "ya29.a0AfB_byA-Yfji97wpOU2iRgVcj1RjE4BvL0kopBCfsRj-kM_BCQLPM9djAO6UVPUUrAmIPw50IeGAvekeYB8SleBTrEwh1QVfcq47yYe21x_C5Bkab18eXWMoRRLoq3_MSaP9DZtpHzqn3iqIELA1gY--ZmEE1lu5ne42aCgYKAfcSAQ8SFQHGX2MidZit_1934RB6prLVmy-EqQ0171", "client_id": "803709923896-ovj0pu9v0qah9b1isr8bo4rgcga0bthh.apps.googleusercontent.com", "client_secret": "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", "refresh_token": null, "token_expiry": "2023-12-06T08:13:27Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byA-Yfji97wpOU2iRgVcj1RjE4BvL0kopBCfsRj-kM_BCQLPM9djAO6UVPUUrAmIPw50IeGAvekeYB8SleBTrEwh1QVfcq47yYe21x_C5Bkab18eXWMoRRLoq3_MSaP9DZtpHzqn3iqIELA1gY--ZmEE1lu5ne42aCgYKAfcSAQ8SFQHGX2MidZit_1934RB6prLVmy-EqQ0171", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client","redirect_uris":["urn:ietf:wg:oauth:2.0:oob","http://localhost"],"auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token"}}
\ No newline at end of file
diff --git a/connection.yml.sample b/connection.yml.sample
index 02a00b9..97a5d64 100644
--- a/connection.yml.sample
+++ b/connection.yml.sample
@@ -85,3 +85,12 @@ sources:
       # channel_names:
       #   - general
       #   - random
+  
+  gdrive:
+    drive_example:
+      folder_name:
+      credentials_file: /Users/kumarohit/Downloads/client_secret.json
+      cache: true
+      exclude_patterns:
+        - .pdf
+        - .docx
diff --git a/hawk_scanner/commands/firebase.py b/hawk_scanner/commands/firebase.py
index 2371681..64553c7 100644
--- a/hawk_scanner/commands/firebase.py
+++ b/hawk_scanner/commands/firebase.py
@@ -80,7 +80,7 @@ def execute(args):
                 else:
                     system.print_error(f"Incomplete Firebase configuration for key: {key}")
         else:
-            system.print_error("No Firebase connection details found in connection.yml")
+            system.print_error("No Firebase connection details found in connection file")
     else:
         system.print_error("No 'sources' section found in connection.yml")
     
diff --git a/hawk_scanner/commands/gdrive.py b/hawk_scanner/commands/gdrive.py
new file mode 100644
index 0000000..78591b1
--- /dev/null
+++ b/hawk_scanner/commands/gdrive.py
@@ -0,0 +1,142 @@
+import os, json
+from pydrive2.auth import GoogleAuth
+from pydrive2.drive import GoogleDrive
+from rich.console import Console
+from hawk_scanner.internals import system
+from pydrive2.fs import GDriveFileSystem
+
+def connect_google_drive(credentials_file):
+    """Return the Drive client (`fs.client`) for `credentials_file`, or None if connecting fails."""
+    with open(credentials_file, 'r') as handle:  # context manager closes the handle
+        credentials = json.load(handle)
+    client_id = credentials['client_id']
+    client_secret = credentials['client_secret']
+    try:
+        fs = GDriveFileSystem("root", client_id=client_id, client_secret=client_secret, token=credentials_file)
+        system.print_debug("Connected to Google Drive")
+        return fs.client
+    except Exception as e:
+        print(f"Failed to connect to Google Drive: {e}")
+    # Only reached when connecting failed (success returns above): remove ./client_secrets.json.
+    os.system("rm -rf client_secrets.json")
+
+def download_file(drive, file_obj, base_path):
+    """Download `file_obj` (recursing into folders) beneath `base_path`.
+
+    The local directory is `base_path` joined with the title of each parent
+    folder other than 'My Drive'. Any exception is caught and printed.
+    """
+    try:
+        file_name = file_obj['title']
+        file_id = file_obj['id']
+
+        # Build the destination directory from the parent folder titles.
+        folder_path = base_path
+        for parent_id in file_obj['parents'] or []:
+            parent_folder = drive.CreateFile({'id': parent_id['id']})
+            if parent_folder['title'] == 'My Drive':
+                continue
+            folder_path = os.path.join(folder_path, parent_folder['title'])
+
+        file_path = os.path.join(folder_path, file_name)
+
+        if file_obj['mimeType'] == 'application/vnd.google-apps.folder':
+            os.makedirs(file_path, exist_ok=True)
+            folder_files = drive.ListFile({'q': f"'{file_id}' in parents"}).GetList()
+            for folder_file in folder_files:
+                download_file(drive, folder_file, folder_path)
+        else:
+            # Ensure the target directory exists before writing (e.g. a file directly under base_path).
+            os.makedirs(folder_path, exist_ok=True)
+            file_obj.GetContentFile(file_path)
+
+        system.print_debug(f"File downloaded to: {file_path}")
+    except Exception as e:
+        print(f"Failed to download file: {e}")
+
+def list_files(drive, folder_name=None):
+    """Return all Drive files, or the root entries titled `folder_name` (escaped for the query); [] on error."""
+    try:
+        return (drive.ListFile({'q': "'root' in parents and title='%s'" % str(folder_name).replace("\\", "\\\\").replace("'", "\\'")}) if folder_name else drive.ListFile()).GetList()
+    except Exception as e:
+        print(f"Error listing files: {e}")
+        return []
+
+def execute(args):
+    """Scan each profile under sources.gdrive and return one result dict per pattern match.
+
+    Files are downloaded below data/google_drive and checked with
+    system.read_match_strings; `args` is not read by this function.
+    """
+    import shutil  # local import: only used for the cache cleanup at the end
+
+    download_root = "data/google_drive"
+    results = []
+    connections = system.get_connection()
+    is_cache_enabled = False
+    drive_config = None
+
+    if 'sources' in connections:
+        drive_config = connections['sources'].get('gdrive')
+    else:
+        system.print_error("No 'sources' section found in connection.yml")
+
+    if not drive_config:
+        system.print_error("No Google Drive connection details found in connection file")
+
+    for key, config in (drive_config or {}).items():
+        credentials_file = config.get('credentials_file')
+        folder_name = config.get('folder_name')
+        # exclude_patterns is a key of the profile itself, alongside `cache`.
+        exclude_patterns = config.get('exclude_patterns', [])
+        is_cache_enabled = config.get('cache', False)
+        drive = connect_google_drive(credentials_file)
+        if not drive:
+            system.print_error("Failed to connect to Google Drive")
+            continue
+
+        for file_obj in list_files(drive, folder_name=folder_name):
+            file_id = file_obj['id']
+            file_name = file_obj['title']
+            if file_obj['mimeType'] == 'application/vnd.google-apps.folder':
+                # Folders are mirrored recursively but not scanned in this loop.
+                # NOTE(review): with folder_name set only the folder entry is listed, so its
+                # contents look downloaded-but-never-scanned - confirm intended behaviour.
+                download_file(drive, file_obj, download_root)
+                continue
+            if system.should_exclude_file(file_name, exclude_patterns):
+                continue
+
+            # Same layout download_file uses: root + titles of non-'My Drive' parents.
+            folder_path = download_root
+            for parent_id in file_obj['parents'] or []:
+                parent_folder = drive.CreateFile({'id': parent_id['id']})
+                if parent_folder['title'] == 'My Drive':
+                    continue
+                folder_path = os.path.join(folder_path, parent_folder['title'])
+            file_path = os.path.join(folder_path, file_name)
+
+            # Download once, and only when the cached copy cannot be reused.
+            if is_cache_enabled and os.path.exists(file_path):
+                system.print_debug(f"File already exists in cache, using it.")
+            else:
+                download_file(drive, file_obj, download_root)
+
+            for match in system.read_match_strings(file_path, 'gdrive') or []:
+                results.append({
+                    'file_id': file_id,
+                    'file_name': file_name,
+                    'file_path': file_path,
+                    'pattern_name': match['pattern_name'],
+                    'matches': match['matches'],
+                    'sample_text': match['sample_text'],
+                    'profile': key,
+                    'data_source': 'gdrive'
+                })
+
+    if not is_cache_enabled:
+        shutil.rmtree(download_root, ignore_errors=True)
+    return results
+
+# Call the execute function with the necessary arguments
+# execute(your_args)
diff --git a/hawk_scanner/commands/mongodb.py b/hawk_scanner/commands/mongodb.py
index 157a269..7ce6359 100644
--- a/hawk_scanner/commands/mongodb.py
+++ b/hawk_scanner/commands/mongodb.py
@@ -23,9 +23,20 @@ def connect_mongodb(host, port, username, password, database, uri=None):
         return None
 
 
-def check_data_patterns(db, patterns, profile_name, database_name, limit_start=0, limit_end=500):
+def check_data_patterns(db, patterns, profile_name, database_name, limit_start=0, limit_end=500, whitelisted_collections=None):
     results = []
-    for collection_name in db.list_collection_names():
+    all_collections = db.list_collection_names()
+
+    if whitelisted_collections:
+        collections_to_scan = [collection for collection in all_collections if collection in whitelisted_collections]
+    else:
+        collections_to_scan = all_collections or []
+
+    for collection_name in collections_to_scan:
+        if collection_name not in all_collections:
+            system.print_warning(f"Collection {collection_name} not found in the database. Skipping.")
+            continue
+
         collection = db[collection_name]
         for document in collection.find().limit(limit_end).skip(limit_start):
             for field_name, field_value in document.items():
@@ -69,6 +80,7 @@ def execute(args):
                 uri = config.get('uri')  # Added support for URI
                 limit_start = config.get('limit_start', 0)
                 limit_end = config.get('limit_end', 500)
+                collections = config.get('collections', [])
 
                 if uri:
                     system.print_info(f"Checking MongoDB Profile {key} using URI")
@@ -80,9 +92,13 @@ def execute(args):
 
                 db = connect_mongodb(host, port, username, password, database, uri)
                 if db:
-                    results += check_data_patterns(db, patterns, key, database, limit_start=limit_start, limit_end=limit_end)
+                    results += check_data_patterns(db, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_collections=collections)
         else:
             system.print_error("No MongoDB connection details found in connection.yml")
     else:
         system.print_error("No 'sources' section found in connection.yml")
     return results
+
+# Example usage
+if __name__ == "__main__":
+    execute(None)
diff --git a/hawk_scanner/commands/mysql.py b/hawk_scanner/commands/mysql.py
index 41e6a36..2747dc8 100644
--- a/hawk_scanner/commands/mysql.py
+++ b/hawk_scanner/commands/mysql.py
@@ -19,13 +19,16 @@ def connect_mysql(host, port, user, password, database):
     except Exception as e:
         system.print_error(f"Failed to connect to MySQL database at {host} with error: {e}")
 
-def check_data_patterns(conn, patterns, profile_name, database_name, limit_start=0, limit_end=500, tables=None):
+def check_data_patterns(conn, patterns, profile_name, database_name, limit_start=0, limit_end=500, whitelisted_tables=None):
     cursor = conn.cursor()
     
     # Get the list of tables to scan
     cursor.execute("SHOW TABLES")
     tables = [table[0] for table in cursor.fetchall()]
-    tables_to_scan = tables or []  # Use all tables if tables[] is blank or not provided
+    if whitelisted_tables:
+        tables_to_scan = [table for table in tables if table in whitelisted_tables]
+    else:
+        tables_to_scan = tables or []
 
     table_count = 1
 
@@ -83,11 +86,11 @@ def execute(args):
                 limit_end = config.get('limit_end', 500)
                 tables = config.get('tables', [])
 
-                if host and user and password and database:
+                if host and user and database:
                     system.print_info(f"Checking MySQL Profile {key} and database {database}")
                     conn = connect_mysql(host, port, user, password, database)
                     if conn:
-                        results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, tables=tables)
+                        results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables)
                         conn.close()
                 else:
                     system.print_error(f"Incomplete MySQL configuration for key: {key}")
diff --git a/hawk_scanner/commands/postgresql.py b/hawk_scanner/commands/postgresql.py
index ffb6274..282dc0e 100644
--- a/hawk_scanner/commands/postgresql.py
+++ b/hawk_scanner/commands/postgresql.py
@@ -19,13 +19,16 @@ def connect_postgresql(host, port, user, password, database):
     except Exception as e:
         system.print_error(f"Failed to connect to PostgreSQL database at {host} with error: {e}")
 
-def check_data_patterns(conn, patterns, profile_name, database_name, limit_start=0, limit_end=500, tables=None):
+def check_data_patterns(conn, patterns, profile_name, database_name, limit_start=0, limit_end=500, whitelisted_tables=None):
     cursor = conn.cursor()
     
     # Get the list of tables to scan
     cursor.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'")
     all_tables = [table[0] for table in cursor.fetchall()]
-    tables_to_scan = tables or all_tables  # Use all tables if tables[] is blank or not provided
+    if whitelisted_tables:
+        tables_to_scan = [table for table in all_tables if table in whitelisted_tables]
+    else:
+        tables_to_scan = all_tables or []
 
     table_count = 1
 
@@ -91,7 +94,7 @@ def execute(args):
                     system.print_info(f"Checking PostgreSQL Profile {key}, database {database}")
                     conn = connect_postgresql(host, port, user, password, database)
                     if conn:
-                        results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, tables=tables)
+                        results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables)
                         conn.close()
                 else:
                     system.print_error(f"Incomplete PostgreSQL configuration for key: {key}")
diff --git a/hawk_scanner/commands/redis.py b/hawk_scanner/commands/redis.py
index f5b6d2d..f153c68 100644
--- a/hawk_scanner/commands/redis.py
+++ b/hawk_scanner/commands/redis.py
@@ -18,7 +18,7 @@ def connect_redis(host, port):
         system.print_error(f"Redis instance at {host}:{port} is not accessible with error: {e}")
 
 def get_patterns_from_file(file_path):
-    with open(file_path, 'r') as file:
+    with open(file_path, 'r', encoding='utf-8') as file:
         patterns = yaml.safe_load(file)
         return patterns
 
diff --git a/hawk_scanner/commands/s3.py b/hawk_scanner/commands/s3.py
index 8cb239b..bae88a2 100644
--- a/hawk_scanner/commands/s3.py
+++ b/hawk_scanner/commands/s3.py
@@ -27,7 +27,7 @@ def get_last_update_time(obj):
     return None
 
 def get_patterns_from_file(file_path):
-    with open(file_path, 'r') as file:
+    with open(file_path, 'r', encoding='utf-8') as file:
         patterns = yaml.safe_load(file)
         return patterns
 
diff --git a/hawk_scanner/internals/system.py b/hawk_scanner/internals/system.py
index df9cbd0..0df828b 100644
--- a/hawk_scanner/internals/system.py
+++ b/hawk_scanner/internals/system.py
@@ -31,11 +31,14 @@ def calculate_msg_hash(msg):
     return hashlib.sha256(msg.encode()).hexdigest()
 
 def print_info(message):
-    console.print(f"[yellow][INFO][/yellow] {message}")
+    console.print(f"[yellow][INFO][/yellow] {str(message)}")
 
 def print_debug(message):
     if args.debug:
-        console.print(f"[blue][DEBUG][/blue] {message}")
+        try:
+            console.print(f"[blue][DEBUG][/blue] {str(message)}")
+        except Exception as e:
+            pass
 
 def print_error(message):
     console.print(f"[bold red]❌ {message}")
@@ -395,8 +398,7 @@ def SlackNotify(msg):
         
         slack_config = notify_config.get('slack', {})
         webhook_url = slack_config.get('webhook_url', '')
-        
-        if webhook_url != '':
+        if webhook_url and webhook_url.startswith('https://hooks.slack.com/services/'):
             try:
                 payload = {
                     'text': msg,
diff --git a/hawk_scanner/main.py b/hawk_scanner/main.py
index 7444dc8..65c7eb4 100644
--- a/hawk_scanner/main.py
+++ b/hawk_scanner/main.py
@@ -21,7 +21,7 @@ def clear_screen():
 console = Console()
 
 ## Now separate the results by data_source
-data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb']
+data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb', 'gdrive']
 
 def load_command_module(command):
     try:
@@ -99,6 +99,8 @@ def main():
             table.add_column("Channel Name > Message Link")
         elif group == 'couchdb':
             table.add_column("Host > Database > Document ID > Field")
+        elif group == 'gdrive':
+            table.add_column("File Name")
 
         table.add_column("Pattern Name")
         table.add_column("Total Exposed")
@@ -390,6 +392,32 @@ def main():
                     exposed_values=records_mini
                 )
                 
+                system.SlackNotify(AlertMsg)
+            elif group == 'gdrive':
+                table.add_row(
+                    str(i),
+                    result['profile'],
+                    f"{result['file_name']}",
+                    result['pattern_name'],
+                    str(len(result['matches'])),
+                    records_mini,
+                    result['sample_text'],
+                )
+                AlertMsg = """
+                *** PII Or Secret Found ***
+                Data Source: Google Drive - {vulnerable_profile}
+                File Name: {file_name}
+                Pattern Name: {pattern_name}
+                Total Exposed: {total_exposed}
+                Exposed Values: {exposed_values}
+                """.format(
+                    vulnerable_profile=result['profile'],
+                    file_name=result['file_name'],
+                    pattern_name=result['pattern_name'],
+                    total_exposed=str(len(result['matches'])),
+                    exposed_values=records_mini
+                )
+                
                 system.SlackNotify(AlertMsg)
             else:
                 # Handle other cases or do nothing for unsupported groups
diff --git a/readme.md b/readme.md
index a997d62..531819c 100644
--- a/readme.md
+++ b/readme.md
@@ -17,7 +17,7 @@
 
 ### 🦅 HAWK Eye - Highly Advanced Watchful Keeper Eye
 
-HAWK Eye is a powerful and versatile CLI (Command-Line Interface) tool designed to be your vigilant watchkeeper, guarding against potential data breaches and cyber threats across various platforms. Inspired by the precision and vision of majestic birds of prey, HAWK Eye swiftly scans multiple data sources, including S3, MySQL, PostgreSQL, MongoDB, CouchDB, Slack, Redis, Firebase, filesystem, Slack, and Google Cloud buckets (GCS), for Personally Identifiable Information (PII) and secrets. It uses text analysis and OCR techniques to go throug most of the documents, database and different file types like  docx, xlsx, pptx, pdf, jpg, png, gif, zip, tar, rar, etc.
+HAWK Eye is a powerful and versatile CLI (Command-Line Interface) tool designed to be your vigilant watchkeeper, guarding against potential data breaches and cyber threats across various platforms. Inspired by the precision and vision of majestic birds of prey, HAWK Eye swiftly scans multiple data sources, including S3, MySQL, PostgreSQL, MongoDB, CouchDB, Google Drive, Slack, Redis, Firebase, filesystem, and Google Cloud buckets (GCS), for Personally Identifiable Information (PII) and secrets. It uses text analysis and OCR techniques to go through most of the documents, database and different file types like  docx, xlsx, pptx, pdf, jpg, png, gif, zip, tar, rar, etc.
 
 
 ### Why "HAWK Eye"?
@@ -154,6 +154,12 @@ Note: If you don't provide any command, it will run all commands (firebase, fs,
           </td>
          <td>Scan S3 profiles for PII and secrets data.</td>
       </tr>
+      <tr>
+         <td>
+            gdrive
+          </td>
+         <td>Scan Google Drive profiles for PII and secrets data.</td>
+      </tr>
       <tr>
          <td>--connection</td>
          <td>Provide a connection YAML local file path like --connection connection.yml, this file will contain all creds and configs for different sources and other configurations.</td>
@@ -266,6 +272,15 @@ sources:
         - private
         - venv
         - node_modules
+  
+  gdrive:
+    drive_example:
+      folder_name:
+      credentials_file: /Users/kumarohit/Downloads/client_secret.json
+      cache: true
+      exclude_patterns:
+        - .pdf
+        - .docx
 
   slack:
     slack_example:
diff --git a/requirements.txt b/requirements.txt
index 5fc4dd0..35aee8c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ boto3
 PyYAML
 rich
 mysql-connector-python
+pymysql
 redis
 firebase-admin
 slack-sdk
@@ -14,4 +15,9 @@ Pillow
 python-docx
 openpyxl
 PyPDF2
-patool
\ No newline at end of file
+patool
+pydrive2
+appdirs
+tqdm
+funcy
+fsspec
\ No newline at end of file