Skip to content

Commit

Permalink
Merge pull request #71 from Arc-rendezvous/issue-19
Browse files Browse the repository at this point in the history
Fix #19
  • Loading branch information
gunawanlg authored Apr 10, 2020
2 parents 04b3d44 + 8e871c1 commit 4e8e306
Show file tree
Hide file tree
Showing 3 changed files with 1,106 additions and 101 deletions.
154 changes: 117 additions & 37 deletions gurih/data/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pandas as pd
import urllib
from selenium import webdriver
# from selenium.common.exceptions import NoSuchElementException
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
Expand Down Expand Up @@ -62,18 +61,28 @@ class BibleIsScraper:
def __init__(self, base_url, driver_path, output_dir='../../dataset/raw/bibleis/'):
self.base_url = base_url
self.driver_path = driver_path
self.output_dir = output_dir
self.output_dir = output_dir if output_dir[-1] == '/' else output_dir + '/'
self.data = []
self.urls = []
self.scrape_text = True
self.scrape_audio = True
self.debug = []

# Get inferred version from base_url
if 'INDASV' in base_url:
version = 'INDASV'
elif 'INDWBT' in base_url:
version = 'INDWBT'
else:
raise ValueError("Base url version not supported."
" Required either INDASV or INDWBT version")
self.version = version

if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
print("Output directory created at " + self.output_dir)
print("Output directory created at " + self.output_dir + "\n")
else:
print("Output directory is already created at " + self.output_dir)
print("Output directory is already created at " + self.output_dir + "\n")

print("Scrape text: " + str(self.scrape_text))
print("Scrape audio: " + str(self.scrape_audio))
Expand Down Expand Up @@ -143,48 +152,22 @@ def scrape_page(self, url):
driver = webdriver.Chrome(self.driver_path)
driver.get(url)

chapter_string = ''
audio_title = ''
chapter_string = ''

if self.scrape_text:
# Get all verses
chapter_section = driver.find_element_by_css_selector(".chapter")
ps = chapter_section.find_elements_by_css_selector("p")

verses = []
if len(ps) != 0:
for p in ps: # not including the chapter number
p_text = p.get_attribute("innerHTML") # get all text

# Find disconnected verse, join it
hanging_verse_idx = p_text.find('<')
if hanging_verse_idx != 0:
hanging_verse = p_text[:hanging_verse_idx]
self.debug.append(f"{url} {hanging_verse}")
if len(verses) == 0: # handle occurence in first <p>
verses.append(hanging_verse)
else:
last_verse = verses.pop()
verses.append(last_verse + " " + hanging_verse)

other_verses = p.find_elements_by_css_selector(".v")
other_verses = [v.get_attribute("innerHTML") for v in other_verses]
verses.extend(other_verses)
# handle chapter not having any p element
else:
other_verses = chapter_section.find_elements_by_css_selector(".v")
other_verses = [v.get_attribute("innerHTML") for v in other_verses]
verses.extend(other_verses)

chapter_string = '\n\n'.join(verses)
if self.version == 'INDASV':
chapter_string = self._scrape_text_indasv(driver, url)
elif self.version == 'INDWBT':
chapter_string = self._scrape_text_indwbt(driver, url)

if self.scrape_audio:
# Get audio file attributes
audio = driver.find_element_by_css_selector(".audio-player")
audio_src = audio.get_attribute("src")
audio_title = re.search("[^?]*", url[28:]).group() + ".mp3"
audio_title = audio_title.replace("/", "_")
audio_title = self.output_dir + audio_title
audio_title = self.output_dir + "audio/" + audio_title
response = urllib.request.urlopen(audio_src)

with open(audio_title, "wb") as f:
Expand All @@ -200,7 +183,7 @@ def to_dataframe(self):

def write_csv(self, filename=None):
if filename is None:
filename = self.output_dir + 'bibleis_transcription.csv'
filename = self.output_dir + 'transcript.csv'

df = self.to_dataframe()
self._check_null_df(df)
Expand Down Expand Up @@ -229,3 +212,100 @@ def _check_null_df(self, df):
sum_null_audio = df['audio_title'].isnull().sum()
if sum_null_audio > 0:
raise ValueError(f"Found {sum_null_audio} null values in audio_title column.")

def _scrape_text_indasv(self, driver, url):
    """
    Scrape all verse text of one INDASV chapter page.

    Parameters
    ----------
    driver : selenium webdriver already navigated to `url`.
    url : str
        Chapter URL; used only to derive the verse data-id prefix.

    Returns
    -------
    str
        Verses joined by blank lines, with footnote
        `<span class="note">` markup stripped out.
    """
    # Verse elements carry a data-id attribute prefixed with
    # book code + chapter number, e.g. data-id="MAT1_1".
    cv = self.__get_chapter(url)
    chapter_section = driver.find_element_by_css_selector(".chapter")
    css_pattern = f"p[data-id^={cv}], span[data-id^={cv}], div[data-id^={cv}]"
    data = chapter_section.find_elements_by_css_selector(css_pattern)

    verses = []
    for d in data:
        d_text = d.get_attribute("innerHTML")
        # Keep only the leading text, up to the first nested tag.
        # Guard with > 0 (not != 0): find() returns -1 when there is
        # no tag at all, and slicing with [:-1] would silently drop
        # the last character of the verse.
        idx = d_text.find('<')
        if idx > 0:
            d_text = d_text[:idx]
        verses.append(d_text)

    chapter_string = '\n\n'.join(verses)

    # Remove footnote markup. Non-greedy match so two notes on the
    # same line do not swallow the verse text between them.
    chapter_string = re.sub('<span class="note".+?span>', '', chapter_string)

    return chapter_string

def _scrape_text_indasv_old(self, driver, url):
    """
    Legacy INDASV chapter scraper kept for reference.

    Walks every <p> inside the ".chapter" element, re-attaching any
    "hanging" verse fragment (plain text before the first tag of a
    paragraph) to the verse collected just before it, then gathers the
    ".v" verse elements. Falls back to scanning ".v" elements directly
    when the chapter has no <p> children at all.

    Returns the verses joined by blank lines.
    """
    chapter_section = driver.find_element_by_css_selector(".chapter")
    paragraphs = chapter_section.find_elements_by_css_selector("p")

    verses = []
    if paragraphs:
        for paragraph in paragraphs:
            inner = paragraph.get_attribute("innerHTML")

            # Plain text before the first tag is the continuation of
            # the previous verse, split across paragraph boundaries.
            cut = inner.find('<')
            if cut != 0:
                fragment = inner[:cut]
                self.debug.append(f"{url} {fragment}")
                if verses:
                    verses[-1] = verses[-1] + " " + fragment
                else:
                    # Fragment appearing in the very first paragraph.
                    verses.append(fragment)

            verses.extend(
                v.get_attribute("innerHTML")
                for v in paragraph.find_elements_by_css_selector(".v")
            )
    else:
        # Chapter without any <p> element: read the verses directly.
        verses = [
            v.get_attribute("innerHTML")
            for v in chapter_section.find_elements_by_css_selector(".v")
        ]

    return '\n\n'.join(verses)

def _scrape_text_indwbt(self, driver, url):
    """
    Scrape all verse text of one INDWBT chapter page.

    The INDWBT extraction logic in this file was byte-identical to the
    INDASV one (same data-id CSS selectors, same footnote stripping),
    so delegate to it instead of keeping a duplicated copy that can
    drift out of sync.

    Parameters
    ----------
    driver : selenium webdriver already navigated to `url`.
    url : str
        Chapter URL; used only to derive the verse data-id prefix.

    Returns
    -------
    str
        Verses joined by blank lines, with footnote markup removed.
    """
    return self._scrape_text_indasv(driver, url)

@staticmethod
def __get_chapter(url):
"""
"https://live.bible.is/bible/INDWBT/MAT/1?audio_type=audio" --> "MAT1"
"""
cv_pattern = re.search("[^?]*", url[35:]).group().replace("/", '')

# In case of starting with digit, WTF??
if cv_pattern[0] in ['1', '2', '3']:
cv_pattern = r'\3' + cv_pattern[0] + ' ' + cv_pattern[1:]

return cv_pattern
Loading

0 comments on commit 4e8e306

Please sign in to comment.