-
Notifications
You must be signed in to change notification settings - Fork 0
/
collegeScraper.py
executable file
·61 lines (53 loc) · 1.77 KB
/
collegeScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
from bs4 import BeautifulSoup, Comment
import requests
import urllib
import json
# Base URL of the paginated "all colleges" index; the page number is appended.
srcUrl = "http://collegestats.org/colleges/all/?pg="
# Next page to fetch; module-level so the scraper functions share/resume it.
pageNumber = 1
# Accumulated results: school name -> dict of address (and optionally url) fields.
schoolListings = {}
def testWorks():
    """Scrape college listings from collegestats.org into ``schoolListings``.

    Iterates pages 1..301 of the paginated index.  Each page embeds per-school
    schema.org microdata inside HTML comments; each such comment is re-parsed
    as its own document to extract the school name, address parts, and website.

    Results accumulate in the module-level ``schoolListings`` dict keyed by
    school name, as {"physicalAddress": ..., "url": ...}.  ``pageNumber`` is
    module-level so a connection failure retries the same page instead of
    skipping it.
    """
    global schoolListings
    global pageNumber
    while pageNumber < 302:
        try:
            url = srcUrl + str(pageNumber)
            request = requests.get(url)
            soup = BeautifulSoup(request.text, "html.parser")
            # School records live inside HTML comments containing microdata.
            comments = soup.find_all(string=lambda text: isinstance(text, Comment))
            for comment in comments:
                if "http://schema.org/CollegeOrUniversity" not in comment:
                    continue
                try:
                    markup = BeautifulSoup(str(comment), "html.parser")
                    name = markup.find('a').text.strip()
                    schoolInfo = markup.find_all('meta')
                    # meta[0..3] = address components; meta[4] = website,
                    # which is missing for some schools.
                    try:
                        webpage = schoolInfo[4]['content']
                    except IndexError:
                        webpage = ""
                    # Join whatever address parts exist (up to four) instead
                    # of indexing each one, so a short listing can't raise.
                    schoolAddress = ", ".join(
                        info['content'] for info in schoolInfo[:4])
                    schoolListings[name] = {"physicalAddress": schoolAddress,
                                            "url": webpage}
                except UnicodeEncodeError:
                    # Skip schools whose names can't be encoded under
                    # Python 2 rather than aborting the whole scrape.
                    pass
            pageNumber += 1
        except requests.exceptions.ConnectionError:
            # Transient network failure: pageNumber was NOT incremented,
            # so the same page is retried on the next loop iteration.
            pass
def works():
    """Legacy scraper variant: reads school listings straight from the page
    markup (``div.school-listing``) rather than the embedded microdata
    comments that ``testWorks`` parses.

    Populates the module-level ``schoolListings`` dict in place, keyed by
    school name, with a single comma-joined "address" field per school.
    """
    global schoolListings
    global pageNumber
    while pageNumber < 297:
        page = requests.get(srcUrl + str(pageNumber))
        soup = BeautifulSoup(page.text, "html.parser")
        for listing in soup.findAll('div', {'class': 'school-listing'}):
            title = listing.find('p').text
            # Collect every <meta> content value as one address component.
            parts = [meta['content'] for meta in listing.findAll('meta')]
            schoolListings[title] = {"address": ", ".join(parts)}
        pageNumber += 1
# Run the microdata-comment scraper and emit the collected listings as JSON
# on stdout.  print() with a single argument behaves identically under both
# Python 2 (parenthesized expression) and Python 3 (function call), unlike
# the bare Python 2 `print` statement this replaces.
testWorks()
print(json.dumps(schoolListings))