
Commit

Merge pull request #4 from unlockre/feat/add-apt-ratings-scrapper
feat: Add Apt Ratings scraper
javisolis-keyway authored Nov 21, 2024
2 parents 96fea01 + 36d4e04 commit 5bcda59
Showing 4 changed files with 178 additions and 1 deletion.
16 changes: 16 additions & 0 deletions backend/inputs/get_apartmentratings_reviews.js
@@ -0,0 +1,16 @@
/**
 * @typedef {import('../../frontend/node_modules/botasaurus-controls/dist/index').Controls} Controls
 */


/**
 * @param {Controls} controls
 */
function getInput(controls) {
    controls
        .link('url', {
            placeholder: "https://www.apartmentratings.com/ga/jonesboro/villas-by-the-lake_678817444130328/",
            label: 'Property URL',
            isRequired: true
        })
}
16 changes: 16 additions & 0 deletions backend/inputs/get_apartmentratings_reviews_request.js
@@ -0,0 +1,16 @@
/**
 * @typedef {import('../../frontend/node_modules/botasaurus-controls/dist/index').Controls} Controls
 */


/**
 * @param {Controls} controls
 */
function getInput(controls) {
    controls
        .link('url', {
            placeholder: "https://www.apartmentratings.com/ga/jonesboro/villas-by-the-lake_678817444130328/",
            label: 'Property URL',
            isRequired: true
        })
}
19 changes: 18 additions & 1 deletion backend/scrapers.py
@@ -2,6 +2,7 @@
from urllib.parse import urlparse
import re
from botasaurus_server.server import Server
from src.aptratings import get_apartmentratings_reviews, get_apartmentratings_reviews_request
from src.gmaps import google_maps_scraper, website_contacts_scraper
import random
from botasaurus_server.ui import View, Field, ExpandDictField, ExpandListField, filters, sorts, CustomField
@@ -329,6 +330,7 @@ def show_if(input_data):
],
remove_duplicates_by="place_id"
)

except:
Server.add_scraper(
google_maps_scraper,
@@ -386,7 +388,22 @@ def get_website_contacts_scraper_task_name(data):
"emails", "phones", "linkedin", "twitter", "facebook",
"youtube", "instagram", "github", "snapchat", "tiktok"
]

Server.add_scraper(
    get_apartmentratings_reviews,
    create_all_task=True,
    split_task=lambda data: [{"url": data["url"]}],
    get_task_name=lambda data: data["url"],
    sorts=[
        sorts.AlphabeticAscendingSort("website"),
        sorts.AlphabeticDescendingSort("website"),
    ],
)
Server.add_scraper(
    get_apartmentratings_reviews_request,
    create_all_task=True,
    split_task=lambda data: [{"url": data["url"]}],
    get_task_name=lambda data: data["url"],
)
Server.add_scraper(
    website_contacts_scraper,
    get_task_name=get_website_contacts_scraper_task_name,
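
A minimal sketch of how the split_task and get_task_name hooks above behave, reusing the diff's own lambdas on a hypothetical submission:

# The server uses these when creating tasks from the UI input defined above.
split_task = lambda data: [{"url": data["url"]}]
get_task_name = lambda data: data["url"]

data = {"url": "https://www.apartmentratings.com/ga/jonesboro/villas-by-the-lake_678817444130328/"}
print(split_task(data))     # [{'url': 'https://...'}] -- one sub-task per submitted URL
print(get_task_name(data))  # the URL doubles as the task's display name
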
128 changes: 128 additions & 0 deletions src/aptratings.py
@@ -0,0 +1,128 @@
from datetime import datetime
from botasaurus.browser import browser, Driver
from botasaurus.request import request, Request
import bs4
import os
import json

ZENROWS_USER = os.getenv("ZENROWS_USER")
ZENROWS_PWD = os.getenv("ZENROWS_PWD")
PROXY_SERVICE_API_KEY = os.getenv("PROXY_SERVICE_API_KEY")
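# All three must be set in the environment before these scrapers run;
# otherwise the proxy URL below is built with the literal string "None".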

def extract_review_data(review_object):
    data = {}

    # Extract Review Creation Date
    if 'dates' in review_object and 'earliest_create_date_dt' in review_object['dates']:
        data['review_creation_date'] = review_object['dates']['earliest_create_date_dt']["time"]
    if 'review_creation_date' in data:
        data['review_creation_date'] = datetime.fromtimestamp(data['review_creation_date'] / 1000).isoformat()

    # Extract Review Text
    if 'strings' in review_object and 'review_text_s' in review_object['strings']:
        data['review_text'] = review_object['strings']['review_text_s']
    elif 'text' in review_object:
        data['review_text'] = review_object['text']

    # Extract Rating
    if 'floats' in review_object and 'rating_overall_f' in review_object['floats']:
        data['rating'] = review_object['floats']['rating_overall_f']

    # Extract Writer
    if 'strings' in review_object and 'author_s' in review_object['strings']:
        data['writer'] = review_object['strings']['author_s']

    # Extract Review ID
    if 'id' in review_object:
        data['review_id'] = review_object['id']

    # Extract Owner Response
    if 'strings' in review_object and 'response_text_s' in review_object['strings']:
        data['owner_response'] = review_object['strings']['response_text_s']

    # Extract Owner Response Date
    if 'dates' in review_object and 'response_date_created_dt' in review_object['dates']:
        data['owner_response_date'] = review_object['dates']['response_date_created_dt']["time"]
    if 'owner_response_date' in data:
        data['owner_response_date'] = datetime.fromtimestamp(data['owner_response_date'] / 1000).isoformat()

    return data
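
# Illustrative only -- a hypothetical review object in the shape the
# function above expects, and roughly what it returns:
#   extract_review_data({
#       "id": "rev-123",
#       "strings": {"review_text_s": "Great place", "author_s": "Jane"},
#       "floats": {"rating_overall_f": 4.0},
#       "dates": {"earliest_create_date_dt": {"time": 1732147200000}},
#   })
#   -> {'review_creation_date': <ISO-8601 local timestamp>, 'review_text': 'Great place',
#       'rating': 4.0, 'writer': 'Jane', 'review_id': 'rev-123'}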

def extract_reviews_from_html(html):
    soup = bs4.BeautifulSoup(html, 'html.parser')
    scripts = soup.find_all('script')
    reviews = []

    for script in scripts:
        if "searchReviews" in script.text:
            queries = json.loads(script.text).get('props').get('pageProps').get('dehydratedState').get('queries')
            for query in queries:
                if "searchReviews" in query.get('queryHash'):
                    for r in query.get('state').get('data').get('reviews'):
                        review = extract_review_data(r[0])
                        reviews.append(review)
    return reviews
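
# The parsing above assumes a Next.js data script whose JSON embeds
# react-query dehydrated state, roughly this skeleton (inferred from the
# lookups above, not from any documented schema):
#   {"props": {"pageProps": {"dehydratedState": {"queries": [
#       {"queryHash": "...searchReviews...",
#        "state": {"data": {"reviews": [[{...review object...}], ...]}}}
#   ]}}}
# Each entry in "reviews" is itself a list, hence the r[0] unwrap.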


def parse_pagination_info(html_content):
    soup = bs4.BeautifulSoup(html_content, 'html.parser')

    # Find the main pagination container
    pagination_container = soup.find('div', class_='pagination')

    # Extract total reviews
    results_text = pagination_container.find('div', class_='Styles__Results-sc-1s0ur21-2').text
    total_reviews = int(results_text.split('of')[1].strip().split()[0])

    # Extract total pages by counting the page buttons
    page_buttons = pagination_container.find_all('span', class_='Styles__ButtonText-sc-1s0ur21-4')
    total_pages = max(int(button.text) for button in page_buttons if button.text.isdigit())

    return total_reviews, total_pages
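
# The selectors above target styled-components class names; the markup is
# assumed to look roughly like this (reconstructed from the lookups, and the
# hashed suffixes may change whenever the site redeploys):
#   <div class="pagination">
#     <div class="Styles__Results-sc-1s0ur21-2">1 - 20 of 187 reviews</div>
#     <span class="Styles__ButtonText-sc-1s0ur21-4">1</span>
#     <span class="Styles__ButtonText-sc-1s0ur21-4">2</span>
#   </div>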

def fetch_reviews(url, fetch_html_func):
    html = fetch_html_func(url)
    reviews = []

    total_reviews, total_pages = parse_pagination_info(html)
    reviews.extend(extract_reviews_from_html(html))
    print(total_reviews, total_pages)

    # Subsequent pages are appended with '&', which presumes the incoming
    # URL already carries a query string; a bare property URL would need
    # '?page=' instead.
    for page in range(2, total_pages + 1):
        new_url = f"{url}&page={page}"
        print(new_url)
        html = fetch_html_func(new_url)
        reviews.extend(extract_reviews_from_html(html))

    return reviews

@browser(proxy=f"http://{ZENROWS_USER}:{ZENROWS_PWD}@superproxy.zenrows.com:1337")
def get_apartmentratings_reviews(driver: Driver, data):
    url = data['url']
    # driver.get() returns None, so the `or` falls through to the rendered page HTML
    return fetch_reviews(url, lambda u: driver.get(u) or driver.page_html)

@request()
def get_apartmentratings_reviews_request(request: Request, data):
    url = data['url']
    headers = {
        'Authorization': PROXY_SERVICE_API_KEY,
        'Content-Type': 'application/json'
    }
    payload = {
        "url": url,
        "method": "GET",
        "proxy_source": "zenrows",
        "response_type": "html",
        "proxy_settings": {
            "asp": True,
            "premium_proxy": False
        },
        "body": None,
        "headers": None
    }

    def fetch_html_func(u):
        # Rewrites the shared payload's URL for each page, then fetches the
        # HTML through the internal proxy service.
        payload["url"] = u
        response = request.post('https://proxy-service.whykeyway.com/get_data', headers=headers, json=payload)
        return response.text

    return fetch_reviews(url, fetch_html_func)
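
For ad-hoc testing outside the Botasaurus server, the decorated functions can be called directly, assuming (as in Botasaurus examples) that the data dict is accepted as the first argument; the URL here is the placeholder from the input controls above, and PROXY_SERVICE_API_KEY must be set:

from src.aptratings import get_apartmentratings_reviews_request

reviews = get_apartmentratings_reviews_request({
    "url": "https://www.apartmentratings.com/ga/jonesboro/villas-by-the-lake_678817444130328/"
})
print(len(reviews), "reviews extracted")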
