Commit
Automatic Google Scraping added
Copy05 committed Apr 10, 2023
1 parent e365b48 commit c0d58cd
126 changes: 98 additions & 28 deletions AntiCopyright.py
@@ -2,51 +2,32 @@
import time
import urllib.parse
import warnings
from urllib.parse import urlparse

import requests
import os.path

from bs4 import BeautifulSoup
from art import *
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

tprint("AntiCopyright")
print("------------------------------[ By Agesoft ]--------------------------------")

print("Setting up ChromeDriver")

# Shut Up!
warnings.filterwarnings("ignore", category=DeprecationWarning)
CLASSNAME = "yuRUbf"

# Setting up ChromeDriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)

print("[1] Scrap Google Search\n")

choice = int(input("Enter Option: "))

if choice == 1:

start_time = time.time() # Start Timestamp
links = [] # URLs
sitetext = [] # Site Text
uniqueURLs = [] # Unique URLs
def ScrapGoogle():
start_time = time.time() # Start Timestamp
links = [] # URLs
sitetext = [] # Site Text
uniqueURLs = [] # Unique URLs

    query = str(input("Enter search query: "))
    pages = int(input("How many pages should be scanned: "))

    # Scraping each Google Search results page
    for page in range(1, pages + 1):
        print(f"Search Page: {page}")
-        url = "http://www.google.com/search?q=" + query + "&start=" + str((page - 1) * 10)
+        url = "http://www.google.com/search?hl=en&q=" + query + "&start=" + str((page - 1) * 10)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')

-        elements = soup.find_all('div', class_="yuRUbf")
+        elements = soup.find_all('div', class_=CLASSNAME)

        for h in elements:
            link = h.a.get('href')
@@ -73,7 +54,7 @@
        if phn not in uniqueURLs:
            uniqueURLs.append(phn)

-    file.write(f"""(c) Copyright 2023 Agesoft. All Rights Reserved
+    file.write(f"""(c) Copyright {datetime.date.today().year} Agesoft. All Rights Reserved
AntiCopyright Google Scraping report from {datetime.date.today()}\n
Details:
Query: "{query}"
@@ -94,6 +75,95 @@

print(f"Successfully Scrapped! [{round(time.time() - start_time)}s]")

tprint("AntiCopyright")
print("------------------------------[ By Agesoft ]--------------------------------")

print("Setting up ChromeDriver")

# Shut Up!
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Setting up ChromeDriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)

print("[1] Scrap Google Search")
print("[2] Automatically Scrap Google Search\n")

choice = int(input("Enter Option: "))

if choice == 1:
ScrapGoogle()

if choice == 2:

links = [] # URLs
sitetext = [] # Site Text
uniqueURLs = [] # Unique URLs
pages = 1000
scannedpages = 1

try:
query = str(input("Enter search query: "))

# Scrapping each Google Search site
print("Starting scrapping:\n\n")
while True:
for page in range(1, pages):
print(f"Search Page: {page}")
url = "http://www.google.com/search?hl=en&q=" + query + "&start=" + str((page - 1) * 10)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'lxml')

elements = soup.find_all('div', class_=CLASSNAME)

for h in elements:
link = h.a.get('href')
links.append(link)
sitetext.append(h.h3.text)

for i in range(0, len(links)):
print(f"Text: {sitetext[int(i)]}\nLink: {links[int(i)]}\n\n")

time.sleep(5)
scannedpages = scannedpages + 1
except KeyboardInterrupt:
print("Scrapping stopped")

# Creating the file
currentTime = round(time.time() * 1000)
filename = os.path.join("C:\\AntiCopyright", f"AntiCopyright_GSReport_{currentTime}")

if not os.path.isdir("C:\\AntiCopyright"):
os.mkdir("C:\\AntiCopyright")

print("Creating report file.")
file = open(f"{filename}.txt", 'w', encoding="utf-8")
file.write(f"""(c) Copyright {datetime.date.today().year} Agesoft. All Rights Reserved
AntiCopyright Google Scrapping report from {datetime.date.today()}\n
Details:
Query: "{query}"
Indexes Found: {len(links)}
Pages Scanned: {scannedpages}\n\n""")

for i in range(0, len(links)):
file.write(f"Text: {sitetext[int(i)]}\nLink: {links[int(i)]}\n\n")

file.write("URLs:\n")
for i in range(0, len(links)):
parsed = urllib.parse.urlparse(links[int(i)])
phn = parsed.hostname

if phn not in uniqueURLs:
uniqueURLs.append(phn)

for i in range(0, len(uniqueURLs)):
file.write(f"{uniqueURLs[int(i)]}\n")

print(f"[Success] File can be found at: {file.name}")
file.close()

if choice > 1:
print("Error: Invalid Choice")
exit(1)
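Note on the driver setup: webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options) is the Selenium 3 calling convention, which is why the script silences DeprecationWarning. A minimal sketch of the equivalent setup under Selenium 4 (assuming selenium>=4 alongside webdriver-manager; not part of this commit):

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    # Selenium 4 takes the driver path via a Service object, and the
    # `options` keyword replaces the deprecated `chrome_options` argument.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                              options=chrome_options)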

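Note on deduplication: both report writers filter hostnames with `if phn not in uniqueURLs`, a linear scan over a list for every link. A self-contained sketch of the same urllib.parse hostname extraction using a set for constant-time membership tests (illustrative names and URLs, not from the script):

    import urllib.parse

    # Illustrative input; the script collects these from Google results.
    links = [
        "https://example.com/page-a",
        "https://example.com/page-b",
        "https://example.org/page-c",
    ]

    unique_hosts = []   # keeps first-seen order, like the report file
    seen = set()        # O(1) membership test instead of a list scan
    for link in links:
        host = urllib.parse.urlparse(link).hostname
        if host not in seen:
            seen.add(host)
            unique_hosts.append(host)

    print(unique_hosts)  # ['example.com', 'example.org']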
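Note on pagination: the URL builder drives Google's `start` query parameter, which offsets the organic results by ten per page, while the `yuRUbf` container class it scrapes is an obfuscated, Google-generated name that can change without notice (at which point find_all() simply returns an empty list). A quick check of the pagination arithmetic:

    # Each Google results page holds 10 organic results, so page n is
    # requested with start=(n - 1) * 10, matching the URL builder above.
    for page in (1, 2, 3):
        print(f"page {page} -> start={(page - 1) * 10}")
    # page 1 -> start=0, page 2 -> start=10, page 3 -> start=20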