-
Notifications
You must be signed in to change notification settings - Fork 0
/
rightmove.py
161 lines (144 loc) · 5.8 KB
/
rightmove.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Import necessary modules
import requests
from bs4 import BeautifulSoup
import json
import datetime
import csv
# HTTP headers passed to every ``requests.get`` call in this file; the
# User-Agent string identifies the client as desktop Firefox on Linux.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0",
}
class RightMoveScrapper:
    """Scrape a Rightmove search-result listing into a timestamped CSV file.

    Constructing an instance runs the whole scrape: the CSV file is created,
    the header row is written, every result page reachable from ``url`` is
    visited, and one row per property is appended. The file is closed before
    the constructor returns.
    """

    # Offset added to the ``index`` query parameter for each further page.
    RESULTS_PER_PAGE = 24

    # Seconds to wait on any single HTTP request before giving up, so a
    # stalled connection cannot hang the scrape indefinitely.
    REQUEST_TIMEOUT = 30

    # Column titles written as the first row of the CSV file.
    CSV_HEADER = [
        "Property ID",
        "Title",
        "Published Date",
        "Bed",
        "Bath",
        "Price",
        "Address",
        "Distances",
        "Latitude",
        "Longitude",
        "Contact",
        "Display Name",
        "Property Link",
    ]

    def __init__(self, url):
        """Create the result file and scrape every page starting at ``url``.

        The output file is named ``YYYY-MM-DD-HH-MM.csv`` (current local
        time) and is created relative to the working directory.

        Args:
            url: URL of the first search-result page to scrape.
        """
        now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
        self.filename = "%s.csv" % now
        self.url = url
        # newline="" lets the csv writer control line endings itself; the
        # ``with`` block guarantees the file is closed even if a request or
        # a parse step raises part-way through the scrape.
        with open(self.filename, "w", encoding="UTF-8", newline="") as csv_file:
            self.file_details = csv_file
            self.writer_details = csv.writer(
                csv_file, delimiter=",", lineterminator="\n"
            )
            self.writer_details.writerow(self.CSV_HEADER)
            self.visit_listing_page(self.url)

    def visit_listing_page(self, url):
        """Scrape the result page at ``url`` and every following page.

        Each page's embedded JSON model is parsed, its properties are written
        out, and its ``pagination`` entry decides whether another page is
        fetched.

        Args:
            url: URL of the result page to start from.
        """
        # Iterate instead of recursing so the number of pages is not bounded
        # by the interpreter's recursion limit.
        while True:
            print("---> %s" % url)
            site = requests.get(
                url, headers=headers, timeout=self.REQUEST_TIMEOUT
            ).text
            soup = BeautifulSoup(site, "html.parser")
            json_data = self.from_string_to_json(self.get_json_script(soup))
            self.parse_properties(json_data)

            pagination = json_data["pagination"]
            total_pagination = pagination["total"]
            page = pagination["page"]
            if int(page) >= int(total_pagination):
                break
            print(page)
            print(total_pagination)
            url = self._next_page_url(url, page, self.RESULTS_PER_PAGE)

    @staticmethod
    def _next_page_url(url, page, page_size=24):
        """Return ``url`` with its ``index`` parameter set for the next page.

        Args:
            url: URL of the page that was just scraped.
            page: 1-based number of the page that was just scraped.
            page_size: Index offset between consecutive pages.

        Returns:
            The URL of the following page. Any query parameters that come
            after an existing ``&index=`` value are kept intact.
        """
        marker = "&index="
        next_index = str(int(page) * page_size)
        if marker not in url:
            return url + marker + next_index
        before, _, after = url.partition(marker)
        # ``after`` starts with the old index value; keep whatever follows it
        # together with the "&" separator so later parameters stay valid.
        _, separator, remainder = after.partition("&")
        return before + marker + next_index + separator + remainder

    def get_json_script(self, soup):
        """Return the text of the script tag that assigns ``window.jsonModel``.

        Args:
            soup: Parsed page exposing ``find_all("script")``.

        Returns:
            The text of the last matching ``<script>`` tag, or ``""`` when no
            tag contains the assignment.
        """
        json_script = ""
        for tag in soup.find_all("script"):
            if "window.jsonModel = " in tag.text:
                json_script = tag.text
        return json_script

    def parse_properties(self, json_data):
        """Write one CSV row per entry of ``json_data["properties"]``.

        For every property its detail page is fetched to collect station
        distances, the row is written and flushed immediately, and a summary
        of the row (without latitude/longitude) is printed.

        Args:
            json_data: Decoded page model containing a ``"properties"`` list.
        """
        for listing in json_data["properties"]:
            property_id = listing["id"]

            bathrooms = listing["bathrooms"]
            if bathrooms is None:
                bathrooms = "0"

            display_address = listing["displayAddress"]
            country_code = listing.get("countryCode")
            if country_code:
                display_address += " %s" % country_code

            # Coordinates are optional; leave the columns blank without them.
            latitude = ""
            longitude = ""
            location = listing.get("location")
            if location:
                latitude = location["latitude"]
                longitude = location["longitude"]

            customer = listing["customer"]
            property_link = (
                "https://www.rightmove.co.uk/properties/" + str(property_id)
            )
            distances = self.get_distances_from_url(property_link)

            row = [
                property_id,
                listing["propertyTypeFullDescription"],
                listing["firstVisibleDate"],
                listing["bedrooms"],
                bathrooms,
                listing["price"]["amount"],
                display_address,
                str(distances),
                latitude,
                longitude,
                customer["contactTelephone"],
                customer["branchDisplayName"],
                property_link,
            ]
            self.writer_details.writerow(row)
            # Flush per row so already-scraped data survives a later failure.
            self.file_details.flush()
            # The console summary omits the two coordinate columns.
            print(row[:8] + row[10:])

    def get_distances_from_url(self, url):
        """Fetch a property page and return its station distances.

        Args:
            url: URL of a single property's detail page.

        Returns:
            A dict mapping the text of the first ``<span>`` of each list item
            inside the ``Stations-panel`` div to the text of its second
            ``<span>``. Empty when the panel is absent.
        """
        site = requests.get(
            url, headers=headers, timeout=self.REQUEST_TIMEOUT
        ).text
        soup = BeautifulSoup(site, "html.parser")
        stations_div = soup.find("div", {"id": "Stations-panel"})
        distance_dict = {}
        if not stations_div:
            return distance_dict
        for stations_li in stations_div.find_all("li"):
            spans = stations_li.find_all("span")
            # Both a name span and a distance span are needed; skip list
            # items that carry fewer instead of failing on spans[1].
            if len(spans) < 2:
                continue
            distance_dict[spans[0].text] = spans[1].text
        return distance_dict

    def from_string_to_json(self, json_script):
        """Decode the ``window.jsonModel`` script text into Python data.

        Args:
            json_script: Script text of the form ``window.jsonModel = {...}``.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: If the remaining text is not valid JSON
                (including the empty string).
        """
        payload = json_script.replace("window.jsonModel = ", "")
        return json.loads(payload)
# Manual test run: scrape one sample sale search when executed as a script.
if __name__ == "__main__":
    sample_search_url = (
        "https://www.rightmove.co.uk/property-for-sale/find.html"
        "?searchType=SALE&locationIdentifier=REGION%5E87490&insId=1&radius=0.0"
        "&minPrice=&maxPrice=&minBedrooms=&maxBedrooms="
        "&displayPropertyType=&maxDaysSinceAdded=&_includeSSTC=on"
        "&sortByPriceDescending=&primaryDisplayPropertyType="
        "&secondaryDisplayPropertyType=&oldDisplayPropertyType="
        "&oldPrimaryDisplayPropertyType=&newHome=&auction=false"
    )
    RightMoveScrapper(sample_search_url)