Merge pull request #588 from tanjiarui15/fix-parsing.py-for-edx.org

fixes #587
coursera-dl · Feb 21, 2020 · 25cbdee
2 parents 492780c + 505e605
commit 25cbdee
Showing 1 changed file with 6 additions and 6 deletions.
diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
@@ -369,32 +369,32 @@ def extract_sections_from_html(self, page, BASE_URL):
         """
         def _make_url(section_soup):  # FIXME: Extract from here and test
             try:
-                return None
+                return section_soup.a['href']
             except AttributeError:
                 # Section might be empty and contain no links
                 return None
 
         def _get_section_name(section_soup):  # FIXME: Extract from here and test
             try:
-                return section_soup.button.h3.string.strip()
+                return section_soup.a.h4.string.strip()
             except AttributeError:
                 return None
 
         def _make_subsections(section_soup):
             try:
-                subsections_soup = section_soup.select("li.vertical.outline-item.focusable")
+                subsections_soup = section_soup.find_all('li', class_=['subsection'])
             except AttributeError:
                 return []
             # FIXME correct extraction of subsection.name (unicode)
             subsections = [SubSection(position=i,
                                       url=s.a['href'],
-                                      name=s.a.div.div.string.strip())
+                                      name=s.a.h4.string.strip())
                            for i, s in enumerate(subsections_soup, 1)]
 
             return subsections
 
         soup = BeautifulSoup(page)
-        sections_soup = soup.select("li.outline-item.section")
+        sections_soup = soup.find_all('li', class_=['outline-item section'])
 
         sections = [Section(position=i,
                             name=_get_section_name(section_soup),
@@ -422,7 +422,7 @@ def get_page_extractor(url):
         url.startswith('https://lagunita.stanford.edu') or
         url.startswith('https://www.fun-mooc.fr')
     ):
-        return CurrentEdXPageExtractor()
+        return NewEdXPageExtractor()
     else:
         return ClassicEdXPageExtractor()