Skip to content

Commit

Permalink
Merge pull request #588 from tanjiarui15/fix-parsing.py-for-edx.org
Browse files (browse the repository at this point in the history)
fixes #587
  • Loading branch information
balta2ar authored Feb 21, 2020
2 parents 492780c + 505e605 commit 25cbdee
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions edx_dl/parsing.py
Original file line number | Diff line number | Diff line change
@@ -369,32 +369,32 @@ def extract_sections_from_html(self, page, BASE_URL):
"""
def _make_url(section_soup): # FIXME: Extract from here and test
try:
return None
return section_soup.a['href']
except AttributeError:
# Section might be empty and contain no links
return None

def _get_section_name(section_soup): # FIXME: Extract from here and test
try:
return section_soup.button.h3.string.strip()
return section_soup.a.h4.string.strip()
except AttributeError:
return None

def _make_subsections(section_soup):
try:
subsections_soup = section_soup.select("li.vertical.outline-item.focusable")
subsections_soup = section_soup.find_all('li', class_=['subsection'])
except AttributeError:
return []
# FIXME correct extraction of subsection.name (unicode)
subsections = [SubSection(position=i,
url=s.a['href'],
name=s.a.div.div.string.strip())
name=s.a.h4.string.strip())
for i, s in enumerate(subsections_soup, 1)]

return subsections

soup = BeautifulSoup(page)
sections_soup = soup.select("li.outline-item.section")
sections_soup = soup.find_all('li', class_=['outline-item section'])

sections = [Section(position=i,
name=_get_section_name(section_soup),
@@ -422,7 +422,7 @@ def get_page_extractor(url):
url.startswith('https://lagunita.stanford.edu') or
url.startswith('https://www.fun-mooc.fr')
):
return CurrentEdXPageExtractor()
return NewEdXPageExtractor()
else:
return ClassicEdXPageExtractor()

Expand Down

0 comments on commit 25cbdee

Please sign in to comment.