-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
203 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
import requests | ||
import pandas as pd | ||
from github import Github | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
def convert_to_number(s):
    """Convert an abbreviated count such as ``"1.2k"`` or ``"3,456"`` to an int.

    Args:
        s: Text holding a number, optionally with thousands separators and an
            optional one-letter suffix: ``k`` (thousand), ``m`` (million) or
            ``b`` (billion). The suffix is case-insensitive.

    Returns:
        The value as an integer.

    Raises:
        ValueError: If ``s`` is empty, ends in an unknown suffix, or its
            numeric part cannot be parsed.
    """
    multipliers = {'k': 1000, 'm': 1000000, 'b': 1000000000}

    # Thousands separators ("1,234") and surrounding whitespace would
    # otherwise make int()/float() reject the text.
    text = s.strip().replace(',', '')
    if not text:
        raise ValueError("Cannot convert an empty string to a number.")

    # No suffix: the text is already a plain integer.
    if text[-1].isdigit():
        return int(text)

    numeric_part, suffix = text[:-1], text[-1].lower()
    if suffix not in multipliers:
        raise ValueError(f"Unknown suffix '{suffix}' in input.")

    # round() instead of int(): a binary float product can land just below the
    # exact value (1.005 * 1000 == 1004.9999999999999) and int() would truncate.
    return round(float(numeric_part) * multipliers[suffix])
|
||
def get_star_count(repo_url, timeout=10):
    """Scrape a GitHub repository page and return its star count.

    Args:
        repo_url: URL of the repository page to fetch.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            single stalled connection would block the whole run indefinitely.

    Returns:
        The star count as an int, or None when the page cannot be fetched,
        no stargazers link is found, or the count text cannot be parsed.
        Failures are reported on stdout rather than raised.
    """
    try:
        response = requests.get(repo_url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # First <a> whose href mentions 'stargazers'.
        star_element = soup.find('a', href=lambda href: href and 'stargazers' in href)
        if not star_element:
            print("Star element not found.")
            return None

        star_count_text = star_element.get_text(strip=True)
        if not star_count_text.isdigit():
            # The link text is not a bare integer; prefer a following <span>
            # sibling when one exists.
            star_count_span = star_element.find_next_sibling('span')
            if star_count_span:
                star_count_text = star_count_span.get_text(strip=True)

        # Drop any trailing "star"/"stars" label, then expand k/m/b suffixes.
        star_count_text = star_count_text.split('star')[0].strip()
        star_count = convert_to_number(star_count_text)
        print(f"{repo_url} has {star_count} stars.")
        return star_count
    except Exception as e:
        # Deliberately best-effort: one bad repository must not abort the batch.
        print(f"Error getting star count for {repo_url}: {e}")
        return None
|
||
|
||
# Number of top-starred repositories to keep in the output file.
TOP_N = 750

# Read the Excel file into a DataFrame
excel_file = 'Omega Top 10,000 Projects.xlsx'  # Replace with your Excel file path
df = pd.read_excel(excel_file)

# Each repository should be fetched only once
df = df.drop_duplicates(subset=['URL'])

# Get the star count for each repository (None when the lookup fails)
df['Star_Count'] = df['URL'].apply(get_star_count)

# Sort the DataFrame by star count in descending order
df_sorted = df.sort_values(by='Star_Count', ascending=False)

# Keep the TOP_N most-starred repositories
top_starred = df_sorted.head(TOP_N)

# Save the results to a new Excel file
top_starred.to_excel('top_750_starred_repos.xlsx', index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import pandas as pd | ||
from selenium import webdriver | ||
from selenium.webdriver.common.keys import Keys | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.chrome.service import Service | ||
from selenium.webdriver.chrome.options import Options | ||
from webdriver_manager.chrome import ChromeDriverManager | ||
import time | ||
|
||
# Load the Excel file into a pandas DataFrame
df = pd.read_excel('top_750_starred_repos.xlsx')

# Initialize Chrome options for WebDriver
options = Options()
options.add_argument("--headless")  # Comment this line out if you want the browser to open visibly

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Authenticate once
driver.get("https://github.com/login")
time.sleep(2)  # Wait for the login page to load

# Input GitHub username and password (replace the placeholders with real credentials)
driver.find_element(By.ID, "login_field").send_keys("user_name")
password_field = driver.find_element(By.ID, "password")
password_field.send_keys("password")
password_field.send_keys(Keys.RETURN)
# Submitting the form returns immediately; give the login time to complete
# before the first repository page is requested.
time.sleep(2)

# Pause here for manual 2FA
# input("Complete the 2FA/device verification in the browser, then press Enter here to continue...")
|
||
def check_vulnerability_feature(repository_url):
    """Report whether the page shows a "Report a vulnerability" heading.

    Loads ``repository_url`` in the shared module-level ``driver``, waits two
    seconds, and returns True if any large sub-heading on the page contains
    the text "Report a vulnerability"; otherwise returns False.
    """
    driver.get(repository_url)
    time.sleep(2)  # Wait for the page to load
    heading_selector = 'h1[data-view-component="true"].Subhead-heading.Subhead-heading--large'
    for heading in driver.find_elements(By.CSS_SELECTOR, heading_selector):
        if "Report a vulnerability" in heading.text:
            return True
    return False
|
||
# List to store the results
vulnerability_feature_enabled = []

# Loop through the DataFrame and check each URL; the browser is closed even
# when a check raises, so no Chrome process is left behind.
try:
    for _, row in df.iterrows():
        # Strip a trailing slash so the advisory path is not joined with '//'
        repository_url = row['URL'].rstrip('/') + "/security/advisories/new"
        feature_exists = check_vulnerability_feature(repository_url)
        print(f"Feature exists for {repository_url}: {feature_exists}")
        vulnerability_feature_enabled.append(feature_exists)
finally:
    # Close the WebDriver
    driver.quit()

# Add the results to the DataFrame
df['Vulnerability Feature Enabled'] = vulnerability_feature_enabled

# Write the updated DataFrame to a new Excel file
df.to_excel('chrome_checked.xlsx', index=False)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import pandas as pd | ||
import requests | ||
|
||
def check_repository_files(owner, repo, github_token, timeout=10):
    """Check a repository for workflow YAML files and a dependabot.yml.

    Queries the GitHub contents API for ``.github/workflows`` and
    ``.github/dependabot.yml``.

    Args:
        owner: Repository owner (user or organisation name).
        repo: Repository name.
        github_token: Personal access token sent in the Authorization header.
        timeout: Seconds to wait for each API response.

    Returns:
        A ``(has_yaml_in_workflows, has_dependabot)`` tuple of booleans.
        Request failures are printed and reported as False, never raised.
    """
    workflows_api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/.github/workflows"
    dependabot_api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/.github/dependabot.yml"
    headers = {"Authorization": f"token {github_token}"}
    has_yaml_in_workflows = False
    has_dependabot = False

    # Check for YAML files in the workflows directory
    try:
        response = requests.get(workflows_api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        entries = response.json()
        # Only a list payload is a directory listing; anything else has no
        # per-file entries to inspect.
        if isinstance(entries, list):
            has_yaml_in_workflows = any(
                entry['name'].endswith(('.yml', '.yaml')) for entry in entries
            )
    except requests.RequestException as e:
        # RequestException also covers connection errors and timeouts, which
        # would otherwise abort the whole batch.
        print(f"Error checking workflows in {owner}/{repo}: {e}")

    # Check for dependabot.yml file in the .github directory
    try:
        response = requests.get(dependabot_api_url, headers=headers, timeout=timeout)
        has_dependabot = response.status_code == 200
    except requests.RequestException as e:
        print(f"Error checking dependabot.yml in {owner}/{repo}: {e}")

    return has_yaml_in_workflows, has_dependabot
|
||
# Replace the placeholder below with your GitHub Personal Access Token
github_token = 'Put your own token'

# Load the spreadsheet to annotate; update the path to your actual Excel file
input_file_path = 'chrome_checked.xlsx'
df = pd.read_excel(input_file_path)

# Adding columns for the check results
df['Has YAML in Workflows'] = False
df['Has Dependabot'] = False

# Iterate over each row in the DataFrame and update the check results
for index, row in df.iterrows():
    repo_url = row['URL']
    # Tolerate surrounding whitespace and a trailing slash; otherwise such
    # URLs split into three parts and the repository is silently skipped.
    owner_repo = repo_url.strip().rstrip('/').replace("https://github.com/", "").split('/')
    if len(owner_repo) != 2:
        print(f"Skipping {repo_url}: not a https://github.com/<owner>/<repo> URL")
        continue
    owner, repo = owner_repo
    has_yaml, has_dependabot = check_repository_files(owner, repo, github_token)
    df.at[index, 'Has YAML in Workflows'] = has_yaml
    df.at[index, 'Has Dependabot'] = has_dependabot

# Specify the output Excel file path and save the updated DataFrame
output_file_path = 'processed_repos_chrome.xlsx'
df.to_excel(output_file_path, index=False)

print("Process completed. Check the output Excel file for results.")
|