Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
arejula27 committed Sep 14, 2024
0 parents commit 8748598
Show file tree
Hide file tree
Showing 8 changed files with 197 additions and 0 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/actions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: run scraper.py

on:
  schedule:
    - cron: '0 0 * * *' # At midnight every day

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo content
        uses: actions/checkout@v2 # checkout the repository content to GitHub runner

      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9' # install the python version needed

      - name: install python packages
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt

      # NOTE: the "Install Playwright browsers" step was removed. The scraper
      # uses Selenium + geckodriver (see requirements.txt); `python -m
      # playwright install` fails because the playwright package is never
      # installed, breaking every scheduled run.

      - name: verify installed packages
        run: python -m pip list

      - name: execute py script # run scraper.py
        run: python scraper.py # No environment variables needed

      - name: commit files
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
          git add -A
          git diff-index --quiet HEAD || (git commit -a -m "updated logs" --allow-empty)

      - name: push changes
        uses: ad-m/[email protected]
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          branch: main
45 changes: 45 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: run scraper.py

on:
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo content
        uses: actions/checkout@v2

      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9' # install the python version needed

      - name: install python packages
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt

      # NOTE: the "Install Playwright browsers" step was removed. The scraper
      # uses Selenium + geckodriver (see requirements.txt); `python -m
      # playwright install` fails because the playwright package is never
      # installed, so this manual-trigger workflow could never succeed.

      - name: verify installed packages
        run: python -m pip list

      - name: execute py script # run scraper.py
        run: python scraper.py

      - name: commit files
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
          git add -A
          git diff-index --quiet HEAD || (git commit -a -m "updated logs" --allow-empty)

      - name: push changes
        uses: ad-m/[email protected]
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          branch: main
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# python-twitter-scraper

To install the required packages, run the following command:
```bash
pip install -r requirements.txt
```
To run the script, use the following command:
```bash
python scraper.py
```
It will open a browser window and ask you to log in to Twitter. After logging in, press Enter in the terminal. The script will then start scraping tweets and save them to a CSV file.
Binary file added geckodriver
Binary file not shown.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
selenium
pandas
93 changes: 93 additions & 0 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import csv
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service


def extract_tweets(tweets):
    """Extract (username, tweet-text) pairs from tweet container elements.

    Args:
        tweets: iterable of Selenium WebElements, each an
            ``article[data-testid="tweet"]`` node.

    Returns:
        A tuple ``(users, texts)`` of parallel lists. Containers that
        fail to parse (e.g. promoted or partially-loaded tweets) are
        skipped with a printed warning.
    """
    users = []
    # BUG FIX: the original code rebound the *parameter* with
    # ``tweets = []`` here, so the loop below iterated an empty list and
    # the function always returned ([], []). Use a distinct name.
    texts = []

    for container in tweets:
        try:
            # Username lives in a span under the "User-Name" test-id div.
            user = container.find_element(
                By.XPATH, './/div[@data-testid="User-Name"]//span').text

            # The tweet body text.
            text = container.find_element(
                By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception as e:
            # Some containers lack these nodes; skip them and keep going.
            print(f"Error extracting data: {e}")
            continue
        # if no errors, print the user and tweet
        print(f"User: {user}")
        print(f"Tweet: {text}")
        users.append(user)
        texts.append(text)
    return users, texts


# Path to geckodriver
# Replace with the path to your geckodriver
geckodriver_path = "./geckodriver"

# Firefox browser configuration
options = webdriver.FirefoxOptions()
# Optional: run Firefox in headless mode (without graphical interface)
# options.add_argument('--headless')
service = Service(executable_path=geckodriver_path)
driver = webdriver.Firefox(service=service, options=options)

# Search URL on X (Twitter)
search_url = "https://x.com/search?q=trump&src=typed_query&f=live"

# Open the page
driver.get(search_url)

# Pause so the user can log in manually before scraping starts.
# BUG FIX: the original bound this to ``input``, shadowing the builtin.
answer = input("Continue?[Y/n]")
if answer == "n":
    driver.quit()
    print("Exiting program")
    exit()

# Accumulators for all scroll passes.
users = []
tweets = []

scroll_count = 15
for _ in range(scroll_count):
    # Scroll down to the bottom of the page to trigger lazy loading.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the page to load more tweets
    time.sleep(40)  # Adjust the delay if needed

    # BUG FIX: the original reused the name ``tweets`` for the raw DOM
    # containers here, clobbering the accumulated tweet texts on every
    # scroll pass and then extending the container list with strings.
    containers = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]')
    print(f"Extracted {len(containers)} new tweets")
    # Extract the users and tweets
    new_users, new_tweets = extract_tweets(containers)
    # Add to the accumulators
    users.extend(new_users)
    tweets.extend(new_tweets)

# Write the results as proper CSV. Tweet text routinely contains commas
# and newlines, so use csv.writer for correct quoting instead of the
# original hand-built f-string rows (which produced a corrupt file).
with open("tweets.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["User", "Tweet"])
    for user, tweet in zip(users, tweets):
        print(f"Tweet: {tweet} \nUser: {user}\n =====================\n")
        writer.writerow([user, tweet])

# Close the browser
driver.quit()

print("Data extracted and saved to tweets.csv")
1 change: 1 addition & 0 deletions tweets.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
User,Tweet

0 comments on commit 8748598

Please sign in to comment.