This repository has been archived by the owner on Oct 5, 2022. It is now read-only.

requested changes from commit after issue #5 #8

Open · wants to merge 6 commits into base: master
87 changes: 61 additions & 26 deletions google_scraper.py
@@ -3,6 +3,7 @@
import urllib
import argparse
import json
import hashlib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
@@ -24,30 +25,47 @@
Author: Rushil Srivastava ([email protected])
'''

# rename a file and its json to sha256 equivalent
def hash_rename(file):
filename = os.path.basename(file)
extension = os.path.splitext(file)[1]
file_json = file.replace(extension,".json")
filename_json = os.path.basename(file_json)
if sys.version_info[0] > 2:
with open(file,"rb") as f:
contents = f.read()
shahash = hashlib.sha256(contents).hexdigest()
else:
with io.open(file,"rb") as f:
contents = f.read()
shahash = hashlib.sha256(contents).hexdigest()
os.rename(file,(file.replace(filename,shahash) + extension))
os.rename(file_json,(file_json.replace(filename_json,shahash) + ".json"))
return

def search(url):
# Create a browser and resize for exact pinpoints
browser = webdriver.Chrome()
browser.set_window_size(1024, 768)
print("\n===============================================\n")
print("[%] Successfully launched Chrome Browser")
print("[%] Successfully launched Chromedriver")

# Open the link
browser.get(url)
time.sleep(1)
print("[%] Successfully opened link.")
print("[%] Link Opened")

element = browser.find_element_by_tag_name("body")

print("[%] Scrolling down.")
print("[%] Scrolling down")
# Scroll down
for i in range(30):
element.send_keys(Keys.PAGE_DOWN)
time.sleep(0.3) # bot id protection

try:
browser.find_element_by_id("smb").click()
print("[%] Successfully clicked 'Show More Button'.")
print("[%] Loaded 'Show More' Element")
for i in range(50):
element.send_keys(Keys.PAGE_DOWN)
time.sleep(0.3) # bot id protection
@@ -56,26 +74,27 @@ def search(url):
element.send_keys(Keys.PAGE_DOWN)
time.sleep(0.3) # bot id protection

print("[%] Reached end of Page.")
print("[%] Reached the end of the page.")

time.sleep(1)
# Get page source and close the browser
source = browser.page_source
if sys.version_info[0] > 2:
with open('dataset/logs/google/source.html', 'w+', encoding='utf-8', errors='replace') as f:
f.write(source)
f.write(unicode(source))
else:
with io.open('dataset/logs/google/source.html', 'w+', encoding='utf-8', errors='replace') as f:
f.write(source)
with io.open('dataset/logs/google/source.html', 'w', encoding='utf-8') as f:
f.write(unicode(source))

browser.close()
print("[%] Closed Browser.")
print("[%] Closed Chromedriver")
print("\n===============================================\n")

return source


def error(link):
print("[!] Skipping {}. Can't download or no metadata.\n".format(link))
def error(link, e):
print("[!] Issue Downloading: {}\n[!] Error: {}".format(link, e))
file = Path("dataset/logs/google/errors.log".format(query))
if file.is_file():
with open("dataset/logs/google/errors.log".format(query), "a") as myfile:
@@ -106,40 +125,57 @@ def download_image(link, image_data):
print("[%] Downloading Image #{} from {}".format(
download_image.delta, link))
try:
# Open and download the file
if sys.version_info[0] > 2:
urllib.request.urlretrieve(link, "dataset/google/{}/".format(query) + "Scrapper_{}.{}".format(str(download_image.delta), type))
else:
urllib.urlopen(link, "dataset/google/{}/".format(query) + "Scrapper_{}.{}".format(str(download_image.delta), type))
print("[%] Downloaded File")
with open("dataset/google/{}/Scrapper_{}.json".format(query, str(download_image.delta)), "w") as outfile:
json.dump(image_data, outfile, indent=4)

# dump the json metadata from the URL to the json file
if sys.version_info[0] > 2:
with open("dataset/google/{}/".format(query) + "Scrapper_{}.json".format(str(download_image.delta)), "w", encoding='utf-8', errors='replace') as outfile:
json.dump(image_data, outfile, indent=4)
else:
with io.open("dataset/google/{}/".format(query) + "Scrapper_{}.json".format(str(download_image.delta)), "w", encoding='utf-8') as outfile:
json.dump(image_data, outfile, indent=4)

# rename the files if the 'hash' property has been set
if download_image.hash == True:
hash_rename("dataset/google/{}/".format(query) + "Scrapper_{}.{}".format(str(download_image.delta), type))

except Exception as e:
download_image.delta -= 1
print("[!] Issue Downloading: {}\n[!] Error: {}".format(link, e))
error(link)
download_image.errors += 1
error(link, e)
except Exception as e:
download_image.delta -= 1
print("[!] Issue getting: {}\n[!] Error:: {}".format(link, e))
error(link)
download_image.errors += 1
error(link, e)


if __name__ == "__main__":
# parse command line options
parser = argparse.ArgumentParser()
parser.add_argument("url", help="Give the url that I should parse.")
parser.add_argument("--hash", help="yes/no - Rename the files and json metadata to their sha256 equivalent. Useful for mutated searches potentially involving duplicate downloads.", nargs="?", default="no")
args = parser.parse_args()

# set local vars from user input
query = urlparse.parse_qs(urlparse.urlparse(args.url).query)['q'][0]
url = args.url
if (args.hash).lower() == "yes":
download_image.hash = True
else:
download_image.hash = False

# check directory and create if necessary
if not os.path.isdir("dataset/"):
os.makedirs("dataset/")
if not os.path.isdir("dataset/google/{}".format(query)):
os.makedirs("dataset/google/{}".format(query))
if not os.path.isdir("dataset/logs/google/".format(query)):
os.makedirs("dataset/logs/google/".format(query))
if not os.path.isdir("dataset/logs/google/"):
os.makedirs("dataset/logs/google/")

source = search(url)

@@ -161,37 +197,36 @@ def download_image(link, image_data):
for i in soup.find_all("div", class_="rg_meta")]
print("[%] Indexed {} Images.".format(len(links)))
print("\n===============================================\n")
print("[%] Getting Image Information.")
print("[%] Getting Image Information.\n")
images = {}
linkcounter = 0
for a in soup.find_all("div", class_="rg_meta"):
print("\n------------------------------------------")
rg_meta = json.loads(a.text)
if 'st' in rg_meta:
title = rg_meta['st']
else:
title = ""
link = rg_meta["ou"]
print("\n[%] Getting info on: {}".format(link))
print("[%] Getting info on: {}".format(link))
try:
image_data = "google", query, rg_meta["pt"], rg_meta["s"], title, link, rg_meta["ru"]
images[link] = image_data
except Exception as e:
images[link] = image_data
print("[!] Issue getting data: {}\n[!] Error: {}".format(rg_meta, e))
print("[!] Issue getting data: {}\n[!] Error: {}\n".format(rg_meta, e))

linkcounter += 1

# Open i processes to download
print("\n------------------------------------------\n")
print("\n===============================================\n")
download_image.delta = 0
download_image.errors = 0
for i, (link) in enumerate(links):
print("\n------------------------------------------\n")
try:
download_image(link, images[link])
except Exception as e:
error(link)
error(link, e)

print("\n\n[%] Done. Downloaded {} images.".format(download_image.delta))
print("\n\n[%] Done. Downloaded images. {} succeeded | {} failed".format(download_image.delta,download_image.errors))
print("\n===============================================\n")