From e399c19c0918ca7f8d2484756aa866a6fe1d88f6 Mon Sep 17 00:00:00 2001
From: estebanwasinger
Date: Fri, 8 Nov 2024 09:18:31 -0300
Subject: [PATCH 1/2] feat: Add Apt Ratings scraper

---
 .../inputs/get_apartmentratings_reviews.js    |  16 +++
 .../get_apartmentratings_reviews_request.js   |  16 +++
 backend/scrapers.py                           |  19 ++-
 src/aptratings.py                             | 118 ++++++++++++++++++
 4 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 backend/inputs/get_apartmentratings_reviews.js
 create mode 100644 backend/inputs/get_apartmentratings_reviews_request.js
 create mode 100644 src/aptratings.py

diff --git a/backend/inputs/get_apartmentratings_reviews.js b/backend/inputs/get_apartmentratings_reviews.js
new file mode 100644
index 000000000..7e9bfcdf2
--- /dev/null
+++ b/backend/inputs/get_apartmentratings_reviews.js
@@ -0,0 +1,16 @@
+/**
+ * @typedef {import('../../frontend/node_modules/botasaurus-controls/dist/index').Controls} Controls
+ */
+
+
+/**
+ * @param {Controls} controls
+ */
+function getInput(controls) {
+    controls
+        .link('url', {
+            placeholder: "https://www.apartmentratings.com/ga/jonesboro/villas-by-the-lake_678817444130328/",
+            label: 'Property URL',
+            isRequired: true
+        })
+}
\ No newline at end of file
diff --git a/backend/inputs/get_apartmentratings_reviews_request.js b/backend/inputs/get_apartmentratings_reviews_request.js
new file mode 100644
index 000000000..7e9bfcdf2
--- /dev/null
+++ b/backend/inputs/get_apartmentratings_reviews_request.js
@@ -0,0 +1,16 @@
+/**
+ * @typedef {import('../../frontend/node_modules/botasaurus-controls/dist/index').Controls} Controls
+ */
+
+
+/**
+ * @param {Controls} controls
+ */
+function getInput(controls) {
+    controls
+        .link('url', {
+            placeholder: "https://www.apartmentratings.com/ga/jonesboro/villas-by-the-lake_678817444130328/",
+            label: 'Property URL',
+            isRequired: true
+        })
+}
\ No newline at end of file
diff --git a/backend/scrapers.py b/backend/scrapers.py
index 0dc3a9aa1..2372d215e 100644
--- a/backend/scrapers.py
+++ b/backend/scrapers.py
@@ -1,6 +1,7 @@
 from urllib.parse import urlparse
 import re
 from botasaurus_server.server import Server
+from src.aptratings import get_apartmentratings_reviews, get_apartmentratings_reviews_request
 from src.gmaps import google_maps_scraper, website_contacts_scraper
 import random
 from botasaurus_server.ui import View, Field, ExpandDictField, ExpandListField, filters, sorts, CustomField
@@ -328,6 +329,7 @@ def show_if(input_data):
     ],
     remove_duplicates_by="place_id"
 )
+
 except:
 Server.add_scraper(
     google_maps_scraper,
@@ -385,7 +387,22 @@ def get_website_contacts_scraper_task_name(data):
     "emails", "phones", "linkedin", "twitter", "facebook", "youtube", "instagram",
     "github", "snapchat", "tiktok"
 ]
-
+Server.add_scraper(
+    get_apartmentratings_reviews,
+    create_all_task=True,
+    split_task=lambda data: [{"url": data["url"]}],
+    get_task_name=lambda data: data["url"],
+    sorts=[  # review_creation_date is ISO-8601, so alphabetic order is chronological
+        sorts.AlphabeticAscendingSort("review_creation_date"),
+        sorts.AlphabeticDescendingSort("review_creation_date"),
+    ],
+)
+Server.add_scraper(
+    get_apartmentratings_reviews_request,
+    create_all_task=True,
+    split_task=lambda data: [{"url": data["url"]}],
+    get_task_name=lambda data: data["url"],
+)
 Server.add_scraper(
     website_contacts_scraper,
     get_task_name=get_website_contacts_scraper_task_name,
diff --git a/src/aptratings.py b/src/aptratings.py
new file mode 100644
index 000000000..870e227c3
--- /dev/null
+++ b/src/aptratings.py
@@ -0,0 +1,118 @@
+from datetime import datetime
+from botasaurus.browser import browser, Driver
+from botasaurus.request import request, Request
+import bs4
+import os
+import json
+
+ZENROWS_USER = os.getenv("ZENROWS_USER")
+ZENROWS_PWD = os.getenv("ZENROWS_PWD")
+ZENROWS_API_KEY = os.getenv("ZENROWS_API_KEY")
+
+def extract_review_data(review_object):
+    data = {}
+
+    # Extract Review Creation Date
+    if 'dates' in review_object and 'earliest_create_date_dt' in review_object['dates']:
+        data['review_creation_date'] = review_object['dates']['earliest_create_date_dt']["time"]
+    if 'review_creation_date' in data:
+        data['review_creation_date'] = datetime.fromtimestamp(data['review_creation_date'] / 1000).isoformat()
+
+    # Extract Review Text
+    if 'strings' in review_object and 'review_text_s' in review_object['strings']:
+        data['review_text'] = review_object['strings']['review_text_s']
+    elif 'text' in review_object:
+        data['review_text'] = review_object['text']
+
+    # Extract Rating
+    if 'floats' in review_object and 'rating_overall_f' in review_object['floats']:
+        data['rating'] = review_object['floats']['rating_overall_f']
+
+    # Extract Writer
+    if 'strings' in review_object and 'author_s' in review_object['strings']:
+        data['writer'] = review_object['strings']['author_s']
+
+    # Extract Review ID
+    if 'id' in review_object:
+        data['review_id'] = review_object['id']
+
+    # Extract Owner Response
+    if 'strings' in review_object and 'response_text_s' in review_object['strings']:
+        data['owner_response'] = review_object['strings']['response_text_s']
+
+    # Extract Owner Response Date
+    if 'dates' in review_object and 'response_date_created_dt' in review_object['dates']:
+        data['owner_response_date'] = review_object['dates']['response_date_created_dt']["time"]
+    if 'owner_response_date' in data:
+        data['owner_response_date'] = datetime.fromtimestamp(data['owner_response_date'] / 1000).isoformat()
+
+    return data
+
+def extract_reviews_from_html(html):
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    scripts = soup.find_all('script')
+    reviews = []
+
+    for script in scripts:
+        if "searchReviews" in script.text:
+            queries = json.loads(script.text).get('props').get('pageProps').get('dehydratedState').get('queries')
+            for query in queries:
+                if "searchReviews" in query.get('queryHash'):
+                    for r in query.get('state').get('data').get('reviews'):
+                        review = extract_review_data(r[0])
+                        reviews.append(review)
+    return reviews
+
+
+def parse_pagination_info(html_content):
+    soup = bs4.BeautifulSoup(html_content, 'html.parser')
+
+    # Find the main pagination container
+    pagination_container = soup.find('div', class_='pagination')
+    if pagination_container is None:  # single page of reviews, no pagination widget rendered
+        return 0, 1
+
+    # Extract total reviews
+    results_text = pagination_container.find('div', class_='Styles__Results-sc-1s0ur21-2').text
+    total_reviews = int(results_text.split('of')[1].strip().split()[0])
+
+    # Extract total pages by counting the page buttons
+    page_buttons = pagination_container.find_all('span', class_='Styles__ButtonText-sc-1s0ur21-4')
+    total_pages = max(int(button.text) for button in page_buttons if button.text.isdigit())
+
+    return total_reviews, total_pages
+
+def fetch_reviews(url, fetch_html_func):
+    html = fetch_html_func(url)
+    reviews = []
+
+    total_reviews, total_pages = parse_pagination_info(html)
+    reviews.extend(extract_reviews_from_html(html))
+    print(f"Pagination reports {total_reviews} reviews across {total_pages} page(s)")
+
+    for page in range(2, total_pages + 1):
+        new_url = f"{url}{'&' if '?' in url else '?'}page={page}"
+        print(f"Fetching {new_url}")
+        html = fetch_html_func(new_url)
+        reviews.extend(extract_reviews_from_html(html))
+
+    return reviews
+
+@browser(proxy=f"http://{ZENROWS_USER}:{ZENROWS_PWD}@superproxy.zenrows.com:1337")
+def get_apartmentratings_reviews(driver: Driver, data):
+    url = data['url']
+    return fetch_reviews(url, lambda u: driver.get(u) or driver.page_html)  # driver.get returns None, so the or-chain yields the rendered HTML
+
+@request()
+def get_apartmentratings_reviews_request(request: Request, data):
+    url = data['url']
+    params = {
+        'url': url,
+        'apikey': ZENROWS_API_KEY,
+        'js_render': 'true'
+    }
+    def fetch_html_func(u):
+        response = request.get('https://api.zenrows.com/v1/', params={**params, 'url': u})
+        return response.text
+
+    return fetch_reviews(url, fetch_html_func)
\ No newline at end of file

From 36d4e0450f9328470e9bceb2f1c8c1635e0a4073 Mon Sep 17 00:00:00 2001
From: estebanwasinger
Date: Fri, 8 Nov 2024 09:49:35 -0300
Subject: [PATCH 2/2] feat: change ZenRows request to Proxy Service

---
 src/aptratings.py | 24 ++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/aptratings.py b/src/aptratings.py
index 870e227c3..6132e35c1 100644
--- a/src/aptratings.py
+++ b/src/aptratings.py
@@ -7,7 +7,7 @@
 
 ZENROWS_USER = os.getenv("ZENROWS_USER")
 ZENROWS_PWD = os.getenv("ZENROWS_PWD")
-ZENROWS_API_KEY = os.getenv("ZENROWS_API_KEY")
+PROXY_SERVICE_API_KEY = os.getenv("PROXY_SERVICE_API_KEY")
 
 def extract_review_data(review_object):
     data = {}
@@ -106,13 +106,25 @@ def get_apartmentratings_reviews(driver: Driver, data):
 @request()
 def get_apartmentratings_reviews_request(request: Request, data):
     url = data['url']
-    params = {
-        'url': url,
-        'apikey': ZENROWS_API_KEY,
-        'js_render': 'true'
+    headers = {
+        'Authorization': PROXY_SERVICE_API_KEY,
+        'Content-Type': 'application/json'
+    }
+    payload = {
+        "url": url,
+        "method": "GET",
+        "proxy_source": "zenrows",
+        "response_type": "html",
+        "proxy_settings": {
+            "asp": True,
+            "premium_proxy": False
+        },
+        "body": None,
+        "headers": None
     }
     def fetch_html_func(u):
-        response = request.get('https://api.zenrows.com/v1/', params={**params, 'url': u})
+        payload["url"] = u
+        response = request.post('https://proxy-service.whykeyway.com/get_data', headers=headers, json=payload)
         return response.text
 
     return fetch_reviews(url, fetch_html_func)
\ No newline at end of file
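
A minimal usage sketch: invoking the new request-based scraper directly, assuming PROXY_SERVICE_API_KEY is exported and using the placeholder URL from the input controls. Botasaurus-decorated functions take a data dict (or a list of them) and return the scraped results.

    from src.aptratings import get_apartmentratings_reviews_request

    # Each result is a dict with keys such as review_text, rating, writer,
    # review_creation_date, review_id and, when present, owner_response.
    reviews = get_apartmentratings_reviews_request({
        "url": "https://www.apartmentratings.com/ga/jonesboro/villas-by-the-lake_678817444130328/",
    })
    print(f"Scraped {len(reviews)} reviews")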