Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
arejula27 committed Sep 14, 2024
0 parents commit 8748598
Show file tree
Hide file tree
Showing 8 changed files with 197 additions and 0 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/actions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: run scraper.py

on:
  schedule:
    - cron: '0 0 * * *' # At midnight every day

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo content
        uses: actions/checkout@v2 # checkout the repository content to GitHub runner

      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9' # install the python version needed

      - name: install python packages
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt

      # NOTE: the "Install Playwright browsers" step was removed. The scraper
      # uses Selenium + geckodriver (see requirements.txt); `python -m
      # playwright install` fails because the playwright package is never
      # installed, breaking every scheduled run.

      - name: verify installed packages
        run: python -m pip list

      - name: execute py script # run scraper.py
        run: python scraper.py # No environment variables needed

      - name: commit files
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
          git add -A
          git diff-index --quiet HEAD || (git commit -a -m "updated logs" --allow-empty)

      - name: push changes
        uses: ad-m/[email protected]
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          branch: main
45 changes: 45 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: run scraper.py

on:
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo content
        uses: actions/checkout@v2

      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9' # install the python version needed

      - name: install python packages
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt

      # NOTE: the "Install Playwright browsers" step was removed. The scraper
      # uses Selenium + geckodriver (see requirements.txt); `python -m
      # playwright install` fails because the playwright package is never
      # installed, so this manual-trigger workflow could never succeed.

      - name: verify installed packages
        run: python -m pip list

      - name: execute py script # run scraper.py
        run: python scraper.py

      - name: commit files
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
          git add -A
          git diff-index --quiet HEAD || (git commit -a -m "updated logs" --allow-empty)

      - name: push changes
        uses: ad-m/[email protected]
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          branch: main
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# python-twitter-scraper

To install the required packages, run the following command:
```bash
pip install -r requirements.txt
```
To run the script, use the following command:
```bash
python scraper.py
```
It will open a browser window and ask you to log in to Twitter. After logging in, press Enter in the terminal. The script will then start scraping tweets and save them to a CSV file.
Binary file added geckodriver
Binary file not shown.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
selenium
pandas
93 changes: 93 additions & 0 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import csv
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service


def extract_tweets(tweets):
    """Extract (username, tweet-text) pairs from tweet container elements.

    Args:
        tweets: iterable of Selenium WebElements, each an
            ``article[data-testid="tweet"]`` node.

    Returns:
        A tuple ``(users, texts)`` of parallel lists. Containers that
        fail to parse (e.g. promoted or partially-loaded tweets) are
        skipped with a printed warning.
    """
    users = []
    # BUG FIX: the original code rebound the *parameter* with
    # ``tweets = []`` here, so the loop below iterated an empty list and
    # the function always returned ([], []). Use a distinct name.
    texts = []

    for container in tweets:
        try:
            # Username lives in a span under the "User-Name" test-id div.
            user = container.find_element(
                By.XPATH, './/div[@data-testid="User-Name"]//span').text

            # The tweet body text.
            text = container.find_element(
                By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception as e:
            # Some containers lack these nodes; skip them and keep going.
            print(f"Error extracting data: {e}")
            continue
        # if no errors, print the user and tweet
        print(f"User: {user}")
        print(f"Tweet: {text}")
        users.append(user)
        texts.append(text)
    return users, texts


# Path to geckodriver
# Replace with the path to your geckodriver
geckodriver_path = "./geckodriver"

# Firefox browser configuration
options = webdriver.FirefoxOptions()
# Optional: run Firefox in headless mode (without graphical interface)
# options.add_argument('--headless')
service = Service(executable_path=geckodriver_path)
driver = webdriver.Firefox(service=service, options=options)

# Search URL on X (Twitter)
search_url = "https://x.com/search?q=trump&src=typed_query&f=live"

# Open the page
driver.get(search_url)

# Pause so the user can log in manually before scraping starts.
# BUG FIX: the original bound this to ``input``, shadowing the builtin.
answer = input("Continue?[Y/n]")
if answer == "n":
    driver.quit()
    print("Exiting program")
    exit()

# Accumulators for all scroll passes.
users = []
tweets = []

scroll_count = 15
for _ in range(scroll_count):
    # Scroll down to the bottom of the page to trigger lazy loading.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the page to load more tweets
    time.sleep(40)  # Adjust the delay if needed

    # BUG FIX: the original reused the name ``tweets`` for the raw DOM
    # containers here, clobbering the accumulated tweet texts on every
    # scroll pass and then extending the container list with strings.
    containers = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]')
    print(f"Extracted {len(containers)} new tweets")
    # Extract the users and tweets
    new_users, new_tweets = extract_tweets(containers)
    # Add to the accumulators
    users.extend(new_users)
    tweets.extend(new_tweets)

# Write the results as proper CSV. Tweet text routinely contains commas
# and newlines, so use csv.writer for correct quoting instead of the
# original hand-built f-string rows (which produced a corrupt file).
with open("tweets.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["User", "Tweet"])
    for user, tweet in zip(users, tweets):
        print(f"Tweet: {tweet} \nUser: {user}\n =====================\n")
        writer.writerow([user, tweet])

# Close the browser
driver.quit()

print("Data extracted and saved to tweets.csv")
1 change: 1 addition & 0 deletions tweets.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
User,Tweet

0 comments on commit 8748598

Please sign in to comment.