Commit 8748598 (0 parents): 8 changed files with 197 additions and 0 deletions.
GitHub Actions workflow, scheduled run (new file, 45 lines):
name: run scraper.py

on:
  schedule:
    - cron: '0 0 * * *' # At midnight every day

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo content
        uses: actions/checkout@v2 # checkout the repository content to GitHub runner

      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9' # install the python version needed

      - name: install python packages
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt

      - name: Install Playwright browsers for Python
        run: |
          python -m playwright install --with-deps

      - name: verify installed packages
        run: python -m pip list

      - name: execute py script # run scraper.py
        run: python scraper.py # No environment variables needed

      - name: commit files
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
          git add -A
          git diff-index --quiet HEAD || (git commit -a -m "updated logs" --allow-empty)

      - name: push changes
        uses: ad-m/[email protected]
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          branch: main
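For reference, the `cron: '0 0 * * *'` schedule above fires once a day at 00:00 UTC (GitHub Actions evaluates cron expressions in UTC). Below is a minimal stdlib-only Python sketch, not part of the repository, that computes when such a daily schedule next fires; it hardcodes the midnight rule rather than parsing general cron syntax.

```python
from datetime import datetime, timedelta, timezone

def next_daily_midnight_run(now=None):
    """Next firing time of a '0 0 * * *' schedule, i.e. the upcoming 00:00 UTC."""
    now = now or datetime.now(timezone.utc)
    today_midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
    # At or after today's 00:00 UTC, the next run is tomorrow's 00:00 UTC.
    return today_midnight + timedelta(days=1)

print("Next scheduled run (UTC):", next_daily_midnight_run().isoformat())
```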
GitHub Actions workflow, manual trigger (new file, 45 lines):
name: run scraper.py

on:
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo content
        uses: actions/checkout@v2

      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9' # install the python version needed

      - name: install python packages
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt

      - name: Install Playwright browsers for Python
        run: |
          python -m playwright install --with-deps

      - name: verify installed packages
        run: python -m pip list

      - name: execute py script # run scraper.py
        run: python scraper.py

      - name: commit files
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
          git add -A
          git diff-index --quiet HEAD || (git commit -a -m "updated logs" --allow-empty)

      - name: push changes
        uses: ad-m/[email protected]
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          branch: main
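This second workflow runs only when triggered manually (`workflow_dispatch`). Besides the "Run workflow" button in the Actions tab, such a workflow can be triggered through the GitHub REST API (`POST /repos/{owner}/{repo}/actions/workflows/{workflow_file}/dispatches`). A sketch using only the Python standard library follows; the owner, workflow file name, and token variable are placeholders, since the actual workflow file name does not appear in this diff.

```python
import json
import os
import urllib.request

# Placeholders: the workflow file name is not visible in this diff, and the owner is assumed.
OWNER = "your-github-user"          # hypothetical
REPO = "python-twitter-scraper"     # repository name taken from the README heading
WORKFLOW_FILE = "manual-run.yml"    # hypothetical file name for the workflow above

url = f"https://api.github.com/repos/{OWNER}/{REPO}/actions/workflows/{WORKFLOW_FILE}/dispatches"
payload = json.dumps({"ref": "main"}).encode()  # branch to run the workflow against

request = urllib.request.Request(
    url,
    data=payload,
    method="POST",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",  # token with access to the repo
    },
)

with urllib.request.urlopen(request) as response:
    # A successful dispatch returns HTTP 204 with an empty body.
    print("Dispatch status:", response.status)
```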
Ignore rule for the local virtual environment (new file, 1 line):
venv/
README (new file, 11 lines):
# python-twitter-scraper

To install the required packages, run:
```bash
pip install -r requirements.txt
```
To run the script:
```bash
python scraper.py
```
It will open a browser window and ask you to log in to Twitter. After logging in, press Enter in the terminal. The script will then start scraping tweets and save them to a CSV file.
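As a quick illustration of consuming that output, here is a short sketch that reads the generated file back with the standard `csv` module, assuming the `User,Tweet` header that `scraper.py` writes:

```python
import csv

# Read the scraped tweets back; assumes the "User,Tweet" header written by scraper.py.
with open("tweets.csv", newline="", encoding="utf-8") as file:
    for row in csv.DictReader(file):
        print(f"{row['User']}: {row['Tweet'][:80]}")
```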
Binary file not shown.
requirements.txt (new file, 1 line):
selenium
scraper.py (new file, 93 lines):
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
import csv
import time


def extract_tweets(containers):
    # Lists to store users and tweets
    users = []
    tweets = []

    # Loop through each container and extract the tweet text and user
    for container in containers:
        try:
            # Extract the username
            user = container.find_element(
                By.XPATH, './/div[@data-testid="User-Name"]//span').text

            # Extract the tweet text
            tweet = container.find_element(
                By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception as e:
            print(f"Error extracting data: {e}")
            continue
        # If no errors, print the user and tweet and add them to the lists
        print(f"User: {user}")
        print(f"Tweet: {tweet}")
        users.append(user)
        tweets.append(tweet)
    return users, tweets


# Path to geckodriver
# Replace with the path to your geckodriver
geckodriver_path = "./geckodriver"

# Firefox browser configuration
options = webdriver.FirefoxOptions()
# Optional: run Firefox in headless mode (without graphical interface)
# options.add_argument('--headless')
service = Service(executable_path=geckodriver_path)
driver = webdriver.Firefox(service=service, options=options)

# Search URL on X (Twitter)
search_url = "https://x.com/search?q=trump&src=typed_query&f=live"

# Open the page
driver.get(search_url)

# Pause here: log in to Twitter in the browser window if needed, then confirm in the terminal
answer = input("Continue? [Y/n] ")
if answer.strip().lower() == "n":
    driver.quit()
    print("Exiting program")
    exit()

# Lists to store users and tweets
users = []
tweets = []

scroll_count = 15
for i in range(scroll_count):
    # Scroll down to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the page to load more tweets
    time.sleep(40)  # Adjust the delay if needed

    # Extract the tweet containers currently rendered on the page
    containers = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]')
    print(f"Extracted {len(containers)} tweet containers")
    # Extract the users and tweets
    new_users, new_tweets = extract_tweets(containers)
    # Add to the lists (duplicates are possible, since each pass re-reads the visible tweets)
    users.extend(new_users)
    tweets.extend(new_tweets)

# Write the results to a CSV file; csv.writer handles commas and newlines in tweet text
with open("tweets.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["User", "Tweet"])
    # Write the data
    for i, tweet in enumerate(tweets):
        print(f"Tweet: {tweet} \nUser: {users[i]}\n =====================\n")
        writer.writerow([users[i], tweet])

# Close the browser
driver.quit()

print("Data extracted and saved to tweets.csv")
tweets.csv (new file, 1 line):
User,Tweet