Vignesh's scripts

purs3lab · Apr 26, 2024 · 4bb4cc8 · 4bb4cc8
1 parent 63b503a
commit 4bb4cc8
Show file tree

Hide file tree

Showing 3 changed files with 203 additions and 0 deletions.
diff --git a/yaml_runner/github_workflow_analysis.py b/yaml_runner/github_workflow_analysis.py
@@ -0,0 +1,86 @@
+import requests
+import pandas as pd
+from github import Github
+
+from bs4 import BeautifulSoup
+
+def convert_to_number(s):
+    """Convert an abbreviated count such as ``'1.2k'`` or ``'1,234'`` to an int.
+
+    Args:
+        s: Text holding either a plain integer (thousands separators and
+            surrounding whitespace are tolerated) or a decimal number
+            followed by a one-letter magnitude suffix: ``k`` (thousand),
+            ``m`` (million) or ``b`` (billion), case-insensitive.
+
+    Returns:
+        The value as an ``int``.
+
+    Raises:
+        ValueError: If the text is empty, the suffix is not one of
+            ``k``/``m``/``b``, or the numeric part cannot be parsed.
+    """
+    # Suffix multipliers
+    multipliers = {'k': 1000, 'm': 1000000, 'b': 1000000000}
+
+    # Surrounding whitespace and thousands separators carry no value.
+    text = str(s).strip().replace(',', '')
+    if not text:
+        raise ValueError("Cannot convert an empty string to a number.")
+
+    # No suffix: the text is already a plain integer
+    if text[-1].isdigit():
+        return int(text)
+
+    # Split into the numeric part and the one-letter suffix
+    numeric_part, suffix = text[:-1], text[-1].lower()
+    if suffix not in multipliers:
+        raise ValueError(f"Unknown suffix '{suffix}' in input.")
+
+    # round() instead of int(): binary floating point makes
+    # 32.3 * 1000 == 32299.999999999996, which int() would truncate to 32299.
+    return round(float(numeric_part) * multipliers[suffix])
+
+def get_star_count(repo_url):
+    """Scrape the star count displayed on a repository web page.
+
+    The page is fetched over HTTP and parsed as HTML; the count is read from
+    the first ``<a>`` element whose ``href`` contains ``'stargazers'`` (or,
+    when that text is not purely digits, from its next sibling ``<span>``
+    if one exists).
+
+    Args:
+        repo_url: URL of the repository page to fetch.
+
+    Returns:
+        The star count as an ``int``, or ``None`` when the element is not
+        found or any error occurs while fetching or parsing.
+    """
+    try:
+        # A timeout keeps one unresponsive host from stalling the whole run.
+        response = requests.get(repo_url, timeout=30)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Find the first <a> element that contains 'stargazers' in its href attribute
+        star_element = soup.find('a', href=lambda href: href and 'stargazers' in href)
+        if not star_element:
+            print("Star element not found.")
+            return None
+
+        star_count_text = star_element.get_text(strip=True)
+        if not star_count_text.isdigit():
+            # If the direct text is not a digit, prefer a sibling span when one exists
+            star_count_span = star_element.find_next_sibling('span')
+            if star_count_span:
+                star_count_text = star_count_span.get_text(strip=True)
+
+        # Keep only what precedes the word 'star' and drop thousands
+        # separators so that text like '1,234 stars' converts cleanly.
+        star_count_text = star_count_text.split('star')[0].strip().replace(',', '')
+        star_count = convert_to_number(star_count_text)
+        print(f"{repo_url} has {star_count} stars.")
+        return star_count
+    except Exception as e:
+        # Deliberately broad: this is a best-effort scrape applied to many
+        # URLs, and one bad page must not abort the whole batch.
+        print(f"Error getting star count for {repo_url}: {e}")
+        return None
+
+
+# Number of most-starred repositories to keep in the output file
+TOP_N = 750
+
+# Read the Excel file into a DataFrame
+excel_file = 'Omega Top 10,000 Projects.xlsx' # Replace with your Excel file path
+df = pd.read_excel(excel_file)
+
+# Fetch each repository only once
+df = df.drop_duplicates(subset=['URL'])
+# Authenticate to GitHub API with a personal access token
+# g = Github('your_github_token') # Replace with your GitHub token
+
+# Iterate over the 'URL' column and check for workflow files
+# df['Has_Workflow'] = df['URL'].apply(has_workflow_yaml_files)
+
+# Get the star count for each repository (None when the lookup failed)
+df['Star_Count'] = df['URL'].apply(get_star_count)
+
+# Sort the DataFrame by star count in descending order.
+# NOTE: with pandas' default na_position, rows without a star count sort last.
+df_sorted = df.sort_values(by='Star_Count', ascending=False)
+
+# Keep the TOP_N most-starred repositories
+top_starred = df_sorted.head(TOP_N)
+
+# Save the results to new Excel files
+# df.to_excel('repos_with_workflow_info.xlsx', index=False)
+top_starred.to_excel('top_750_starred_repos.xlsx', index=False)
diff --git a/yaml_runner/private_advisory_checker.py b/yaml_runner/private_advisory_checker.py
@@ -0,0 +1,57 @@
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+import time
+
+# Load the Excel file into a pandas DataFrame
+df = pd.read_excel('top_750_starred_repos.xlsx')
+
+# Initialize Chrome options for WebDriver
+options = Options()
+options.add_argument("--headless")  # Comment this line out if you want the browser to open visibly
+
+# Initialize the Chrome driver
+driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+
+# Authenticate once; the same browser session is reused for every later page load
+try:
+    driver.get("https://github.com/login")
+    time.sleep(2)  # Wait for the login page to load
+
+    # Input GitHub username and password (replace the placeholders below)
+    driver.find_element(By.ID, "login_field").send_keys("user_name")
+    password_field = driver.find_element(By.ID, "password")
+    password_field.send_keys("password")
+    password_field.send_keys(Keys.RETURN)
+
+    # Give the submitted form time to complete before anything navigates away.
+    # NOTE(review): a fixed delay is a heuristic; confirm it is long enough.
+    time.sleep(2)
+except Exception:
+    # Do not leave an orphaned browser process behind when login fails.
+    driver.quit()
+    raise
+
+# Pause here for manual 2FA (only practical when the browser is visible)
+# input("Complete the 2FA/device verification in the browser, then press Enter here to continue...")
+
+# Function to check for the "Report a vulnerability" feature
+def check_vulnerability_feature(repository_url):
+    """Return True if the loaded page shows a "Report a vulnerability" heading.
+
+    Navigates the module-level ``driver`` to ``repository_url``, waits two
+    seconds, then inspects every large sub-head ``<h1>`` matched by the CSS
+    selector below.
+    """
+    heading_selector = 'h1[data-view-component="true"].Subhead-heading.Subhead-heading--large'
+
+    driver.get(repository_url)
+    time.sleep(2)  # Wait for the page to load
+
+    # Stop at the first heading that carries the expected text
+    for heading in driver.find_elements(By.CSS_SELECTOR, heading_selector):
+        if "Report a vulnerability" in heading.text:
+            return True
+    return False
+
+# List to store the results, one entry per DataFrame row
+vulnerability_feature_enabled = []
+
+try:
+    # Loop through the DataFrame and check each URL
+    for index, row in df.iterrows():
+        # Strip a trailing slash so the path is not built as '//security/...'
+        repository_url = row['URL'].rstrip('/') + "/security/advisories/new"
+        feature_exists = check_vulnerability_feature(repository_url)
+        print(f"Feature exists for {repository_url}: {feature_exists}")
+        vulnerability_feature_enabled.append(feature_exists)
+finally:
+    # Close the WebDriver even when a page load raises part-way through
+    driver.quit()
+
+# Add the results to the DataFrame
+df['Vulnerability Feature Enabled'] = vulnerability_feature_enabled
+
+# Write the updated DataFrame to a new Excel file
+df.to_excel('chrome_checked.xlsx', index=False)
+
diff --git a/yaml_runner/yaml_checker.py b/yaml_runner/yaml_checker.py
@@ -0,0 +1,60 @@
+import pandas as pd
+import requests
+
+# Function to check for YAML files in the .github/workflows directory and dependabot.yml in the .github directory
+def check_repository_files(owner, repo, github_token):
+    """Check a repository for workflow YAML files and a dependabot.yml.
+
+    Two GitHub contents-API requests are made: one listing
+    ``.github/workflows`` and one for ``.github/dependabot.yml``.
+
+    Args:
+        owner: Repository owner, used in the API path.
+        repo: Repository name, used in the API path.
+        github_token: Token sent in the ``Authorization`` header.
+
+    Returns:
+        A tuple ``(has_yaml_in_workflows, has_dependabot)`` of booleans.
+        A request that fails is reported on stdout and leaves the
+        corresponding flag ``False``; no exception from ``requests``
+        propagates to the caller.
+    """
+    workflows_api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/.github/workflows"
+    dependabot_api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/.github/dependabot.yml"
+    headers = {"Authorization": f"token {github_token}"}
+    has_yaml_in_workflows = False
+    has_dependabot = False
+
+    # Check for YAML files in the workflows directory
+    try:
+        response = requests.get(workflows_api_url, headers=headers, timeout=30)
+        response.raise_for_status()
+        entries = response.json()
+        # Only a list payload is scanned; anything else leaves the flag False.
+        if isinstance(entries, list):
+            has_yaml_in_workflows = any(
+                entry['name'].endswith(('.yml', '.yaml')) for entry in entries
+            )
+    except (requests.RequestException, ValueError) as e:
+        # RequestException also covers connection errors and timeouts, so a
+        # single network hiccup cannot abort the whole batch; ValueError
+        # covers a body that is not valid JSON.
+        print(f"Error checking workflows in {owner}/{repo}: {e}")
+
+    # Check for dependabot.yml file in the .github directory
+    try:
+        response = requests.get(dependabot_api_url, headers=headers, timeout=30)
+        if response.status_code == 200:
+            has_dependabot = True
+        elif response.status_code != 404:
+            # Neither found nor "not found": report it rather than silently
+            # recording the file as absent.
+            print(f"Error checking dependabot.yml in {owner}/{repo}: HTTP {response.status_code}")
+    except requests.RequestException as e:
+        print(f"Error checking dependabot.yml in {owner}/{repo}: {e}")
+
+    return has_yaml_in_workflows, has_dependabot
+
+# Replace the placeholder below with your GitHub Personal Access Token
+github_token = 'Put your own token'
+
+# Load the repositories to check; update the path to your actual Excel file
+input_file_path = 'chrome_checked.xlsx'
+df = pd.read_excel(input_file_path)
+
+# Adding columns for the check results
+df['Has YAML in Workflows'] = False
+df['Has Dependabot'] = False
+
+# Iterate over each row in the DataFrame and update the check results
+for index, row in df.iterrows():
+    repo_url = row['URL']
+    # Tolerate surrounding whitespace and a trailing slash, which would
+    # otherwise make the split yield three parts and skip the row.
+    cleaned_url = str(repo_url).strip().rstrip('/')
+    owner_repo = cleaned_url.replace("https://github.com/", "").split('/')
+    if len(owner_repo) != 2:
+        # Make skipped rows visible instead of leaving silent False defaults
+        print(f"Skipping {repo_url}: expected https://github.com/<owner>/<repo>")
+        continue
+    owner, repo = owner_repo
+    has_yaml, has_dependabot = check_repository_files(owner, repo, github_token)
+    df.at[index, 'Has YAML in Workflows'] = has_yaml
+    df.at[index, 'Has Dependabot'] = has_dependabot
+
+# Specify the output Excel file path and save the updated DataFrame
+output_file_path = 'processed_repos_chrome.xlsx'
+df.to_excel(output_file_path, index=False)
+
+print("Process completed. Check the output Excel file for results.")
+