Skip to content

Commit

Permalink
Vignesh's scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
vcharapa committed Apr 26, 2024
1 parent 63b503a commit 4bb4cc8
Show file tree
Hide file tree
Showing 3 changed files with 203 additions and 0 deletions.
86 changes: 86 additions & 0 deletions yaml_runner/github_workflow_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import requests
import pandas as pd
from github import Github

from bs4 import BeautifulSoup

def convert_to_number(s):
    """Convert an abbreviated count such as '1.2k' or '3M' to an int.

    Surrounding whitespace and thousands separators (',') are ignored, so
    '1,234' and ' 2b ' are accepted as well.

    Args:
        s: The text to convert. It is either a plain integer or a number
            followed by one of the suffixes k, m or b (case-insensitive).

    Returns:
        The value as an int. A fractional result is truncated by int().

    Raises:
        ValueError: If the text is empty, the numeric part cannot be parsed,
            or the suffix is not one of k, m, b.
    """
    # Suffix multipliers: thousand, million, billion
    multipliers = {'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}

    text = s.strip().replace(',', '')
    if not text:
        # Indexing text[-1] below would otherwise raise an IndexError.
        raise ValueError("Cannot convert an empty string to a number.")

    # No suffix: the whole text is the number
    if text[-1].isdigit():
        return int(text)

    numeric_part, suffix = text[:-1], text[-1].lower()

    # The numeric part may carry a decimal point, e.g. '1.5' in '1.5k'
    numeric_value = float(numeric_part)

    if suffix not in multipliers:
        raise ValueError(f"Unknown suffix '{suffix}' in input.")
    return int(numeric_value * multipliers[suffix])

def get_star_count(repo_url, timeout=30):
    """Scrape the star count shown on a repository web page.

    The page is fetched, the first <a> whose href contains 'stargazers' is
    located, and its text (or the text of its next sibling <span>) is
    converted to a number.

    Args:
        repo_url: URL of the repository page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The star count as an int, or None when the page cannot be fetched,
        no matching link is found, or the text cannot be parsed.
    """
    try:
        # Without a timeout a single stalled connection would block the
        # whole run indefinitely.
        response = requests.get(repo_url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # First link whose href mentions 'stargazers'
        star_element = soup.find('a', href=lambda href: href and 'stargazers' in href)
        if star_element is None:
            print("Star element not found.")
            return None

        star_count_text = star_element.get_text(strip=True)
        if not star_count_text.isdigit():
            # The link text is not a bare number; prefer a sibling <span>
            # when one exists, otherwise keep the link text.
            star_count_span = star_element.find_next_sibling('span')
            if star_count_span is not None:
                star_count_text = star_count_span.get_text(strip=True)

        # Keep only what precedes the word 'star' and drop thousands separators
        star_count_text = star_count_text.split('star')[0].strip().replace(',', '')
        star_count = convert_to_number(star_count_text)
        print(f"{repo_url} has {star_count} stars.")
        return star_count
    except Exception as e:
        # Deliberately broad: this runs once per repository over a long list,
        # and one failing page must not abort the remaining lookups.
        print(f"Error getting star count for {repo_url}: {e}")
        return None


# Number of top-starred repositories written to the output file
TOP_N = 750

# Read the Excel file into a DataFrame
excel_file = 'Omega Top 10,000 Projects.xlsx'  # Replace with your Excel file path
df = pd.read_excel(excel_file)

# Each repository URL only needs to be looked up once
df = df.drop_duplicates(subset=['URL'])

# Get the star count for each repository (None when it could not be determined)
df['Star_Count'] = df['URL'].apply(get_star_count)

# Sort by star count in descending order; rows without a count sort last
df_sorted = df.sort_values(by='Star_Count', ascending=False)

# Keep the TOP_N most-starred repositories
top_starred = df_sorted.head(TOP_N)

# Save the result; private_advisory_checker.py reads this exact file name
top_starred.to_excel('top_750_starred_repos.xlsx', index=False)
57 changes: 57 additions & 0 deletions yaml_runner/private_advisory_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Load the Excel file into a pandas DataFrame
df = pd.read_excel('top_750_starred_repos.xlsx')

# Initialize Chrome options for WebDriver
options = Options()
# Comment out the next line to open a visible browser window, e.g. when
# using the manual 2FA pause further down.
options.add_argument("--headless")

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Authenticate once
driver.get("https://github.com/login")
time.sleep(2)  # Wait for the login page to load

# Input GitHub username and password, then submit the form
driver.find_element(By.ID, "login_field").send_keys("user_name")
password_field = driver.find_element(By.ID, "password")
password_field.send_keys("password")
password_field.send_keys(Keys.RETURN)

# Give the login request time to finish; navigating away immediately could
# interrupt it and leave the session unauthenticated.
time.sleep(2)

# Pause here for manual 2FA
# input("Complete the 2FA/device verification in the browser, then press Enter here to continue...")

def check_vulnerability_feature(repository_url):
    """Report whether the page at repository_url shows "Report a vulnerability".

    Loads the URL in the module-level WebDriver, waits two seconds, and
    returns True as soon as one large sub-heading contains that text.
    """
    heading_selector = 'h1[data-view-component="true"].Subhead-heading.Subhead-heading--large'

    driver.get(repository_url)
    time.sleep(2)  # Wait for the page to load

    for heading in driver.find_elements(By.CSS_SELECTOR, heading_selector):
        if "Report a vulnerability" in heading.text:
            return True
    return False

# List to store the results, one entry per row of the DataFrame
vulnerability_feature_enabled = []

try:
    # Check the private-advisory page of every repository
    for url in df['URL']:
        repository_url = url + "/security/advisories/new"
        feature_exists = check_vulnerability_feature(repository_url)
        print(f"Feature exists for {repository_url}: {feature_exists}")
        vulnerability_feature_enabled.append(feature_exists)
finally:
    # Always close the WebDriver, even if a page check raised, so that no
    # browser process is left running.
    driver.quit()

# Add the results to the DataFrame
df['Vulnerability Feature Enabled'] = vulnerability_feature_enabled

# Write the updated DataFrame to a new Excel file
df.to_excel('chrome_checked.xlsx', index=False)

60 changes: 60 additions & 0 deletions yaml_runner/yaml_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pandas as pd
import requests

def check_repository_files(owner, repo, github_token, timeout=30):
    """Check a repository for workflow YAML files and a dependabot.yml.

    Queries the GitHub contents API for `.github/workflows` and
    `.github/dependabot.yml`. A 404 response means the path is absent and is
    not reported as an error; any other failure is printed and the affected
    flag stays False.

    Args:
        owner: Repository owner (user or organisation name).
        repo: Repository name.
        github_token: Token sent in the Authorization header.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A tuple (has_yaml_in_workflows, has_dependabot) of booleans.
    """
    contents_url = f"https://api.github.com/repos/{owner}/{repo}/contents/.github"
    workflows_api_url = f"{contents_url}/workflows"
    dependabot_api_url = f"{contents_url}/dependabot.yml"
    headers = {"Authorization": f"token {github_token}"}
    has_yaml_in_workflows = False
    has_dependabot = False

    # Check for YAML files in the workflows directory
    try:
        response = requests.get(workflows_api_url, headers=headers, timeout=timeout)
        if response.status_code != 404:
            response.raise_for_status()
            entries = response.json()
            # Only iterate a list payload; any other JSON shape is treated
            # as "no workflow files".
            if isinstance(entries, list):
                has_yaml_in_workflows = any(
                    entry['name'].endswith(('.yml', '.yaml')) for entry in entries
                )
    except (requests.RequestException, ValueError) as e:
        # RequestException covers HTTP, connection and timeout errors;
        # ValueError covers a response body that is not valid JSON.
        print(f"Error checking workflows in {owner}/{repo}: {e}")

    # Check for dependabot.yml file in the .github directory
    try:
        response = requests.get(dependabot_api_url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            has_dependabot = True
        elif response.status_code != 404:
            # Surface e.g. rate limiting instead of silently recording False.
            response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error checking dependabot.yml in {owner}/{repo}: {e}")

    return has_yaml_in_workflows, has_dependabot

# Replace the placeholder below with your GitHub Personal Access Token
github_token = 'Put your own token'

# Input: the Excel file written by private_advisory_checker.py
# Update the path to your actual Excel file path
input_file_path = 'chrome_checked.xlsx'
df = pd.read_excel(input_file_path)

# Adding columns for the check results
df['Has YAML in Workflows'] = False
df['Has Dependabot'] = False

# Check every repository and record the results in its row
for index, repo_url in df['URL'].items():
    # Reduce 'https://github.com/<owner>/<repo>' to ['<owner>', '<repo>'];
    # stripping '/' keeps URLs with a trailing slash from being skipped.
    if isinstance(repo_url, str):
        owner_repo = repo_url.replace("https://github.com/", "").strip('/').split('/')
    else:
        owner_repo = []  # empty or non-text cell

    if len(owner_repo) != 2:
        # Report the skip so missing results are not mistaken for "False".
        print(f"Skipping {repo_url!r}: expected https://github.com/<owner>/<repo>")
        continue

    owner, repo = owner_repo
    has_yaml, has_dependabot = check_repository_files(owner, repo, github_token)
    df.at[index, 'Has YAML in Workflows'] = has_yaml
    df.at[index, 'Has Dependabot'] = has_dependabot

# Specify the output Excel file path and save the updated DataFrame
output_file_path = 'processed_repos_chrome.xlsx'
df.to_excel(output_file_path, index=False)

print("Process completed. Check the output Excel file for results.")

0 comments on commit 4bb4cc8

Please sign in to comment.