from api import db
from util import http_get, get_date
from models import Food, Menu, LOCATION_TO_ENUM, MEAL_TO_ENUM

BASE_URL = 'http://services.housing.berkeley.edu/FoodPro/dining/static/'
MENU_URL = 'todaysentrees.asp'
MEALS = {'breakfast': 3, 'lunch': 5, 'dinner': 7}
DINING_COMMONS = {'crossroads': 1, 'cafe3': 3, 'foothill': 5, 'clarkkerr': 7}
VEGE_LEGEND = {'vegan': '#800040', 'vegetarian': '#008000'}
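
# Note: the values in MEALS and DINING_COMMONS are child indices into the menu
# page's HTML table (rows select the meal, cells select the dining common), and
# VEGE_LEGEND maps the font colors the menu page uses to flag vegan/vegetarian
# entries. The exact indices are tied to the current FoodPro page layout.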

def crawl():
    # make a web request for today's menu page
    soup = http_get(BASE_URL + MENU_URL)
    # locate the html table that holds the menu data
    html = soup.body.contents[-2].table.tbody.contents[3].td.table.contents
    # caches foods that have already been added to the table this run
    food_cache = {}
    # extract data
    for MEAL in MEALS:
        meal_index = MEALS[MEAL]
        meal_data = html[meal_index]
        for DINING_COMMON in DINING_COMMONS:
            dc_index = DINING_COMMONS[DINING_COMMON]
            if len(meal_data.contents) <= dc_index:
                break
            meal_dc_data = meal_data.contents[dc_index]
            for entry in meal_dc_data.find_all('a'):
                meal_name = entry.contents[0].string
                meal_name, gluten_free = truncate_meal_name(meal_name)
                # skip the "Nutritive Analysis" link
                if 'nutritive analysis' in meal_name.lower():
                    continue
                # look up (or create) the Food model object
                if meal_name in food_cache:
                    food_obj = food_cache[meal_name]
                else:  # food is not in the local cache
                    # check if the food is already in the database
                    food_obj = Food.query.filter_by(name=meal_name).first()
                    # not found in the database either, so crawl its detail page
                    if food_obj is None:
                        food_obj = extract_food_info(entry)
                        db.session.add(food_obj)
                    # add the food to the cache
                    food_cache[meal_name] = food_obj
                menu_obj = Menu(date=get_date(), location=LOCATION_TO_ENUM[DINING_COMMON],
                                meal=MEAL_TO_ENUM[MEAL], food=food_obj)
                db.session.add(menu_obj)
    db.session.commit()
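
# For reference, a minimal sketch of what the helpers imported from `util`
# might look like -- this is an assumption for illustration, not the project's
# actual implementation:
#
#   import requests
#   from bs4 import BeautifulSoup
#   from datetime import date
#
#   def http_get(url):
#       """Fetch a page and parse it into a BeautifulSoup tree."""
#       return BeautifulSoup(requests.get(url).text, 'html.parser')
#
#   def get_date():
#       """Today's date, used to key Menu rows."""
#       return date.today()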

def extract_food_info(entry):
    link = BASE_URL + entry['href']
    meal_name = entry.contents[0].string
    meal_name, gluten_free = truncate_meal_name(meal_name)
    # check for vegan and vegetarian foods via the legend's font colors
    vegan, vegetarian = False, False
    if entry.font['color'] == VEGE_LEGEND['vegan']:
        vegan = True
    elif entry.font['color'] == VEGE_LEGEND['vegetarian']:
        vegetarian = True
    # crawl the link (built above) to check allergens
    allergens, ingredients = get_allergens_and_ingredients(link)
    return Food(name=meal_name, allergens=allergens, ingredients=ingredients,
                vegan=vegan, vegetarian=vegetarian, gluten_free=gluten_free)

def get_allergens_and_ingredients(link):
    html = http_get(link)
    html = html.find_all(face="arial", size="2")[-2:]
    # edge case where no nutrition info is available
    if len(html) <= 1:
        return "n/a", "n/a"
    allergens = html[0].contents[1].string if len(html[0].contents) > 1 else ""
    ingredients = html[1].contents[1].string if len(html[1].contents) > 1 else ""
    return allergens, ingredients
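
# Illustrative only: the code above assumes the last two
# <font face="arial" size="2"> blocks on a FoodPro label page hold the allergen
# and ingredient strings, e.g. ("Contains Wheat, Soy", "Ingredients: ...");
# the concrete values here are hypothetical.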

# truncate the Honey Bear ('HB ') or Gluten Free ('GF ') prefix
# returns a pair of the meal name and whether it is gluten-free
def truncate_meal_name(meal_name):
    gluten_free = False
    if meal_name[0:3] == 'HB ':
        meal_name = meal_name[3:]
    elif meal_name[0:3] == 'GF ':
        meal_name = meal_name[3:]
        gluten_free = True
    return meal_name, gluten_free
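
# Example behavior of truncate_meal_name (the food names are hypothetical):
#   truncate_meal_name('HB Pasta Bar')  -> ('Pasta Bar', False)
#   truncate_meal_name('GF Brown Rice') -> ('Brown Rice', True)

# A minimal entry-point sketch, assuming crawl() can run directly against the
# Flask app context that `api` provides -- the real project may invoke it
# differently, e.g. from a scheduled job:
if __name__ == '__main__':
    crawl()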