scrapping.py
from bs4 import BeautifulSoup
import requests
from user_agent import generate_user_agent
import threading
import time
from doctor import Doctor
'''
ScrapDoctorInfo (class):
Used to scrape information from the target website.
A 3-layer process is followed to scrape the entire data set:
Layer 1 - Fetch the cities in New Jersey
Layer 2 - Fetch specializations per city
Layer 3 - Fetch doctors per specialization per city
In Layer 3 each doctor's data is sent to
Elasticsearch using the Elasticsearch Python API version 6.2.4
'''
class ScrapDoctorInfo(object):
    city_specialization_map = {}

    def __init__(self, city):
        self.parent_domain = "https://health.usnews.com"
        self.url = self.parent_domain + "/doctors/city-index/" + city
        self.city = city

    # Scrapes the URLs of the cities in New Jersey
    def scrap_cities(self):
        # Declare a User-Agent header in order to get permission to access the site
        headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
        citypageresponse = requests.get(self.url, timeout=5, headers=headers)
        print(citypageresponse.status_code)
        # Create an HTML-parsed soup in order to access the HTML tags
        soup = BeautifulSoup(citypageresponse.text, 'lxml')
        # Collect the URLs for the next level of scraping
        city_urls = []
        for link in soup.find_all('a'):
            if "/doctors/specialists-index" in str(link.get("href")):
                city_urls.append(self.parent_domain + link.get("href"))
                self.city_specialization_map[link.get("href").split("/")[-1]] = []
        return city_urls
    # A multi-threaded helper that walks the URL list in batches of four,
    # following a divide-and-conquer strategy to explore the depths of the website
    def multithreadScrap(self, urls, depth, targetFunction):
        threading_frequency = 1
        limit = min(depth, len(urls))
        # Step through the URLs four at a time so each batch runs in parallel
        for i in range(0, limit - 3, 4):
            threads = [
                threading.Thread(target=targetFunction, name="t%d" % (j + 1), args=(urls[i + j],))
                for j in range(4)
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join()
            # Pause between batches to avoid hammering the server
            time.sleep(threading_frequency)
    # Entry point for Layer 3: multi-threaded scraping of the doctor listings per city
    def fetchDoctorInfo(self, Map, depth=13):
        for key in Map.keys():
            if Map[key]:
                self.multithreadScrap(Map[key], depth, self.doctorSoup)
        return self.city_specialization_map
    # Here we visit the page with the list of doctors - Screenshot 4
    def doctorSoup(self, url):
        print("Starting a Thread ", threading.current_thread().name)
        headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
        # Form the full URL and fetch the doctor-listing page
        doctor_pageresponse = requests.get(self.parent_domain + url, timeout=5, headers=headers)
        print(doctor_pageresponse.status_code)
        soup = BeautifulSoup(doctor_pageresponse.text, "lxml")
        temp_city = url.split("/")[-1]
        for a in soup.find_all("li", {"data-view": "dr-search-card"}):
            doc_dom = []
            for x in a.find_all("a"):
                if x.get("href") not in doc_dom:
                    doc_dom.append(x.get("href"))
            if not doc_dom:
                # Skip cards that carry no profile link
                continue
            doc_url = self.parent_domain + doc_dom[0]
            doctor = Doctor(doc_url)
            doctor.buildDoctor(temp_city)
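    # Note: Doctor.buildDoctor (defined in doctor.py, not shown here) is expected to
    # index the scraped profile into Elasticsearch, as described in the module docstring.
    # As a rough, hypothetical sketch of that step with elasticsearch-py 6.2.4, it might
    # resemble: es = Elasticsearch(); es.index(index="doctors", doc_type="doctor", body=doc)
    # where the index name and document shape are assumptions, not taken from this file.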
    # Entry point for Layer 2: multi-threaded scraping of the specialization pages per city
    def scrap_doctor_specialization(self, urls, depth=13):
        self.multithreadScrap(urls, depth, self.specializationSoup)
        return self.city_specialization_map
    # Visits a city's specialists-index page and collects the specialization URLs
    def specializationSoup(self, url):
        print("Starting a Thread ", threading.current_thread().name)
        headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
        specialization_pageresponse = requests.get(url, timeout=5, headers=headers)
        soup = BeautifulSoup(specialization_pageresponse.text, "lxml")
        for link in soup.find_all("a"):
            if ("/" + self.city + "/" + url.split("/")[-1]) in str(link.get("href")):
                if link.get("href") not in self.city_specialization_map[url.split("/")[-1]]:
                    # Map each city to the next level of URLs
                    self.city_specialization_map[url.split("/")[-1]].append(link.get("href"))
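
# A minimal usage sketch of the 3-layer pipeline described in the module docstring.
# The "new-jersey" slug and the depth values are assumptions for illustration, not
# values taken from the original script.
if __name__ == "__main__":
    scraper = ScrapDoctorInfo("new-jersey")
    # Layer 1: city index pages
    city_urls = scraper.scrap_cities()
    # Layer 2: specialization pages per city
    specialization_map = scraper.scrap_doctor_specialization(city_urls, depth=13)
    # Layer 3: doctor pages per specialization per city (indexed via the Doctor class)
    scraper.fetchDoctorInfo(specialization_map, depth=13)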