Skip to content
This repository has been archived by the owner on Oct 17, 2021. It is now read-only.

Commit

Permalink
Angga/enrich courses data for ulas kelas (#4)
Browse files: browse the repository at this point in the history.
* feat: scrape desc and prereq

* feat: enrich serialize course

* feat: handle max len desc

* feat: finish scrape desc and prereq

Co-authored-by: Muhammad Erlangga <[email protected]>
  • Loading branch information
angga1518 and Muhammad Erlangga authored Sep 10, 2021
1 parent 57c0a2b commit 4959067
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 11 deletions.
3 changes: 2 additions & 1 deletion backend/app/services/scrapper/schedule_scrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from models.major import Major
from models.period import Period
from models.user import User
from scraper.main import scrape_courses_with_credentials, AUTH_URL
from scraper.main import scrape_courses_with_credentials, AUTH_URL, generate_desc_prerequisite


class ScheduleScrapperServices:
Expand Down Expand Up @@ -47,6 +47,7 @@ def callback(ch, method, properties, body):
period.last_update_at = now
period.save()
app.logger.info(f"Done scrapping kd_org: {method.routing_key}; period: {active_period}; at: {now} UTC")
generate_desc_prerequisite(period, username, password)

channel.basic_consume(
queue=queue_name, on_message_callback=callback, auto_ack=True
Expand Down
19 changes: 9 additions & 10 deletions backend/scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def scrape_courses_with_credentials(period, username, password):
r = req.get(CHANGEROLE_URL)
r = req.get(DETAIL_SCHEDULE_URL.format(period=period))
courses = create_courses(r.text, is_detail=True)
generate_desc_prerequisite(courses, req)
return courses


Expand Down Expand Up @@ -95,12 +94,14 @@ def get_period_and_kd_org(html):

return None, None

def generate_desc_prerequisite(courses, req):
print("=== generating desc and prereq ===")
now = datetime.datetime.now()
for course in courses:
html = req.get(DETAIL_COURSES_URL.format(course=course.course_code, curr=course.curriculum)).text
soup = BeautifulSoup(html, 'html.parser')
def generate_desc_prerequisite(period, username, password):
req = requests.Session()
r = req.post(AUTH_URL, data={'u': username,
'p': password}, verify=False)
r = req.get(CHANGEROLE_URL)
for course in period.courses:
r = req.get(DETAIL_COURSES_URL.format(course=course.course_code, curr=course.curriculum)).text
soup = BeautifulSoup(r, 'html.parser')
for textarea in soup.findAll('textarea'):
if textarea.contents:
textarea_content = textarea.contents[0]
Expand All @@ -118,9 +119,7 @@ def generate_desc_prerequisite(courses, req):
prerequisites += p.group().strip() + ","
course.description = desc
course.prerequisite = prerequisites[:-1]
end = datetime.datetime.now()
print("time elapsed ms :: "+ str((end-now).microseconds))
print("time elapsed s :: "+ str((end-now).seconds))
period.save()

def create_courses(html, is_detail=False):
soup = BeautifulSoup(html, 'html.parser')
Expand Down

0 comments on commit 4959067

Please sign in to comment.