From 8e375243c0a9887ecb25dbfe5d9abe50f1bb0b80 Mon Sep 17 00:00:00 2001 From: "Gunawan Lumban Gaol (ID)" Date: Thu, 27 Feb 2020 13:28:15 +0700 Subject: [PATCH 1/5] add INDWBT compat in BibleisScraper --- gurih/data/scraper.py | 120 ++++++++---- notebooks/eda/1.0-glg-scrape-bibleis.ipynb | 206 +++++++++++++++++---- 2 files changed, 252 insertions(+), 74 deletions(-) diff --git a/gurih/data/scraper.py b/gurih/data/scraper.py index 7283462..85eaddb 100644 --- a/gurih/data/scraper.py +++ b/gurih/data/scraper.py @@ -62,18 +62,28 @@ class BibleIsScraper: def __init__(self, base_url, driver_path, output_dir='../../dataset/raw/bibleis/'): self.base_url = base_url self.driver_path = driver_path - self.output_dir = output_dir + self.output_dir = output_dir if output_dir[-1] == '/' else output_dir + '/' self.data = [] self.urls = [] self.scrape_text = True self.scrape_audio = True self.debug = [] + # Get inferred version from base_url + if 'INDASV' in base_url: + version = 'INDASV' + elif 'INDWBT' in base_url: + version = 'INDWBT' + else: + raise ValueError("Base url version not supported." 
+ " Required either INDASV or INDWBT version") + self.version = version + if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) - print("Output directory created at " + self.output_dir) + print("Output directory created at " + self.output_dir + "\n") else: - print("Output directory is already created at " + self.output_dir) + print("Output directory is already created at " + self.output_dir + "\n") print("Scrape text: " + str(self.scrape_text)) print("Scrape audio: " + str(self.scrape_audio)) @@ -143,40 +153,13 @@ def scrape_page(self, url): driver = webdriver.Chrome(self.driver_path) driver.get(url) - chapter_string = '' audio_title = '' if self.scrape_text: - # Get all verses - chapter_section = driver.find_element_by_css_selector(".chapter") - ps = chapter_section.find_elements_by_css_selector("p") - - verses = [] - if len(ps) != 0: - for p in ps: # not including the chapter number - p_text = p.get_attribute("innerHTML") # get all text - - # Find disconnected verse, join it - hanging_verse_idx = p_text.find('<') - if hanging_verse_idx != 0: - hanging_verse = p_text[:hanging_verse_idx] - self.debug.append(f"{url} {hanging_verse}") - if len(verses) == 0: # handle occurence in first

- verses.append(hanging_verse) - else: - last_verse = verses.pop() - verses.append(last_verse + " " + hanging_verse) - - other_verses = p.find_elements_by_css_selector(".v") - other_verses = [v.get_attribute("innerHTML") for v in other_verses] - verses.extend(other_verses) - # handle chapter not having any p element - else: - other_verses = chapter_section.find_elements_by_css_selector(".v") - other_verses = [v.get_attribute("innerHTML") for v in other_verses] - verses.extend(other_verses) - - chapter_string = '\n\n'.join(verses) + if self.version == 'INDASV': + chapter_string = self._scrape_text_indasv(driver, url) + elif self.version == 'INDWBT': + chapter_string = self._scrape_text_indwbt(driver, url) if self.scrape_audio: # Get audio file attributes @@ -184,7 +167,7 @@ def scrape_page(self, url): audio_src = audio.get_attribute("src") audio_title = re.search("[^?]*", url[28:]).group() + ".mp3" audio_title = audio_title.replace("/", "_") - audio_title = self.output_dir + audio_title + audio_title = self.output_dir + "audio/" + audio_title response = urllib.request.urlopen(audio_src) with open(audio_title, "wb") as f: @@ -200,7 +183,7 @@ def to_dataframe(self): def write_csv(self, filename=None): if filename is None: - filename = self.output_dir + 'bibleis_transcription.csv' + filename = self.output_dir + 'transcript.csv' df = self.to_dataframe() self._check_null_df(df) @@ -229,3 +212,68 @@ def _check_null_df(self, df): sum_null_audio = df['audio_title'].isnull().sum() if sum_null_audio > 0: raise ValueError(f"Found {sum_null_audio} null values in audio_title column.") + + def _scrape_text_indasv(self, driver, url): + chapter_string = '' + + # Get all verses + chapter_section = driver.find_element_by_css_selector(".chapter") + ps = chapter_section.find_elements_by_css_selector("p") + + verses = [] + if len(ps) != 0: + for p in ps: # not including the chapter number + p_text = p.get_attribute("innerHTML") # get all text + + # Find disconnected verse, join it + 
hanging_verse_idx = p_text.find('<') + if hanging_verse_idx != 0: + hanging_verse = p_text[:hanging_verse_idx] + self.debug.append(f"{url} {hanging_verse}") + if len(verses) == 0: # handle occurence in first

+ verses.append(hanging_verse) + else: + last_verse = verses.pop() + verses.append(last_verse + " " + hanging_verse) + + other_verses = p.find_elements_by_css_selector(".v") + other_verses = [v.get_attribute("innerHTML") for v in other_verses] + verses.extend(other_verses) + # handle chapter not having any p element + else: + other_verses = chapter_section.find_elements_by_css_selector(".v") + other_verses = [v.get_attribute("innerHTML") for v in other_verses] + verses.extend(other_verses) + + chapter_string = '\n\n'.join(verses) + + return chapter_string + + def _scrape_text_indwbt(self, driver, url): + chapter_string = '' + + # Get all verses + cv_pattern = self.__get_chapter(url) + chapter_section = driver.find_element_by_css_selector(".chapter") + data = chapter_section.find_elements_by_css_selector( + f"span[data-id^={cv_pattern}], div[data-id^={cv_pattern}]" + ) + + verses = [] + for d in data: + d_text = d.get_attribute("innerHTML") + verses.extend([d_text]) + + chapter_string = '\n\n'.join(verses) + + # Clean with class="note" + chapter_string = re.sub('', '', chapter_string) + + return chapter_string + + @staticmethod + def __get_chapter(url): + """ + "https://live.bible.is/bible/INDWBT/MAT/1?audio_type=audio" --> "MAT1" + """ + return re.search("[^?]*", url[35:]).group().replace("/", '') diff --git a/notebooks/eda/1.0-glg-scrape-bibleis.ipynb b/notebooks/eda/1.0-glg-scrape-bibleis.ipynb index dc27097..9695846 100644 --- a/notebooks/eda/1.0-glg-scrape-bibleis.ipynb +++ b/notebooks/eda/1.0-glg-scrape-bibleis.ipynb @@ -13,13 +13,20 @@ "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language overning permissions and limitations under the License." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import Packages" + ] + }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 71, "metadata": { "ExecuteTime": { - "end_time": "2020-02-03T03:06:05.956500Z", - "start_time": "2020-02-03T03:06:05.665500Z" + "end_time": "2020-02-27T05:20:17.247804Z", + "start_time": "2020-02-27T05:20:16.989804Z" } }, "outputs": [ @@ -39,13 +46,40 @@ "%autoreload 2" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Configure Scraper" + ] + }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 72, "metadata": { "ExecuteTime": { - "end_time": "2020-02-03T03:06:06.173500Z", - "start_time": "2020-02-03T03:06:05.960500Z" + "end_time": "2020-02-27T05:20:18.400804Z", + "start_time": "2020-02-27T05:20:18.203804Z" + } + }, + "outputs": [], + "source": [ + "# INDASV version\n", + "# base_url = \"https://live.bible.is/bible/INDASV/MRK/1?audio_type=audio\"\n", + "# output_dir = \"../../dataset/raw/bibleis/INDASV/\"\n", + "\n", + "# INDWBT version\n", + "base_url = \"https://live.bible.is/bible/INDWBT/MRK/1?audio_type=audio\"\n", + "output_dir = \"../../dataset/raw/bibleis/INDWBT/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-27T05:20:18.618804Z", + "start_time": "2020-02-27T05:20:18.405804Z" } }, "outputs": [ @@ -53,7 +87,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Output directory is already created at ../../dataset/raw/bibleis/\n", + "Output directory is already created at ../../dataset/raw/bibleis/INDWBT/\n", + "\n", "Scrape text: True\n", "Scrape audio: True\n", "Edit the configuration by setting corresponding attributes.\n" @@ -61,8 +96,7 @@ } ], "source": [ - "base_url = \"https://live.bible.is/bible/INDASV/MRK/1?audio_type=audio\"\n", - "scraper = BibleIsScraper(base_url, './chromedriver.exe')" + "scraper = BibleIsScraper(base_url, './chromedriver.exe', 
output_dir=output_dir)" ] }, { @@ -74,16 +108,16 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 74, "metadata": { "ExecuteTime": { - "end_time": "2020-02-03T03:06:07.138500Z", - "start_time": "2020-02-03T03:06:06.939500Z" + "end_time": "2020-02-27T05:20:19.496804Z", + "start_time": "2020-02-27T05:20:19.312804Z" } }, "outputs": [], "source": [ - "scraper.scrape_audio = False\n", + "# scraper.scrape_audio = False\n", "# scarper.scrape_text = False" ] }, @@ -91,9 +125,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "# Single Page Test\n", + "\n", "Test `scrape_page` method, ensures expected return value." ] }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## INDASV Version" + ] + }, { "cell_type": "code", "execution_count": 62, @@ -101,7 +146,8 @@ "ExecuteTime": { "end_time": "2020-02-03T03:07:02.281500Z", "start_time": "2020-02-03T03:06:07.839500Z" - } + }, + "hidden": true }, "outputs": [ { @@ -127,6 +173,48 @@ "scraper.scrape_page(test_url)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## INDWBT Version" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-27T05:21:52.438804Z", + "start_time": "2020-02-27T05:20:31.610804Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://live.bible.is/bible/INDWBT/MAT/2?audio_type=audio',\n", + " 'Yesus lahir di kota Betlehem di propinsi Yudea. Pada waktu itu Herodeslah yang memerintah sebagai raja Yudea di bawah pemerintahan kerajaan besar Roma. Kemudian datanglah beberapa orang peramal bintang dari negeri yang jauh di sebelah timur Yerusalem.\\n\\nMereka bertanya-tanya, “Di manakah Anak yang baru lahir itu— yaitu dia yang akan menjadi Raja orang Yahudi? Karena kami sudah melihat satu bintang muncul di sebelah timur yang adalah tanda kelahiran-Nya. 
Jadi kami pun datang untuk menyembah Dia.”\\n\\nKetika Raja Herodes mendengar berita tentang seorang raja orang Yahudi yang baru lahir itu, dia menjadi marah karena merasa tersaingi. Dan seluruh penduduk Yerusalem pun menjadi tidak tenang.\\n\\nLalu Herodes memanggil para imam kepala dan ahli Taurat dan bertanya kepada mereka di mana Kristus akan lahir.\\n\\nJawab mereka, “Di kota Betlehem, Yudea. Karena nabi yang sudah menuliskan perkataan Allah seperti ini,\\n\\n‘Hai penduduk Betlehem, di daerah Yehuda,\\n\\nsekarang kalian tidak lagi dianggap kota yang tidak penting di daerah itu!\\n\\nKarena di tengah-tengah kalian akan dilahirkan seorang pemimpin\\n\\nyang akan memimpin Israel, umat-Ku.’”\\n\\nKemudian Herodes mengadakan pertemuan tertutup bersama para peramal bintang itu. Dengan demikian dia mendapatkan keterangan yang tepat dari mereka tentang kapan bintang itu muncul pertama kali.\\n\\nKemudian dia menyuruh mereka ke Betlehem dan berkata, “Pergi dan carilah keterangan yang pasti tentang Anak itu. Dan setelah kalian menemukan Dia, beritahukanlah kepadaku, supaya saya pun datang menyembah-Nya.”\\n\\nSesudah pertemuan itu, para peramal bintang itu pun pergi. Saat mereka melihat lagi bintang itu sedang naik di sebelah timur, mereka sangat bersukacita. Lalu cahaya bintang itu mengarahkan mereka dan berhenti di atas rumah di mana Anak itu berada.\\n\\nLalu masuklah mereka ke dalam rumah, tempat di mana Anak itu berada. Mereka melihat Anak itu bersama Maria, ibu-Nya, lalu berlutut dan menyembah Anak itu. Mereka membuka kantong-kantongnya dan mempersembahkan berbagai hadiah kepada Anak itu— yaitu emas, kemenyan, dan mur.\\n\\nTetapi tidak lama kemudian, Allah memberitahu mereka melalui mimpi supaya tidak kembali kepada Herodes. Karena itu mereka pulang ke negeri mereka melalui jalan yang lain.\\n\\nSesudah para peramal bintang itu pergi, malaikat Tuhan datang kepada Yusuf dalam mimpi. Malaikat itu berkata, “Bangunlah! Segeralah bawa Anak itu dan ibu-Nya ke Mesir. 
Tinggallah di sana sampai aku datang lagi membawa pesan kepadamu untuk kembali. Karena Herodes berencana mencari Anak itu untuk membunuh Dia.”\\n\\nPada malam itu juga Yusuf bangun dan pergi menuju Mesir bersama Maria dan Yesus.\\n\\nMereka tinggal di Mesir sampai Herodes meninggal. Dengan demikian tepatlah apa yang dikatakan Allah melalui nabi-Nya, “Aku akan memanggil Anak-Ku keluar dari Mesir.”\\n\\nKetika Herodes tahu bahwa para peramal bintang itu sudah menipunya, dia menjadi sangat marah. Lalu dia memerintahkan tentara-tentaranya untuk membunuh semua anak laki-laki di kota Betlehem dan semua daerah di sekitarnya— yaitu anak-anak yang berumur dua tahun ke bawah, sesuai dengan keterangan dari para peramal bintang itu.\\n\\nLalu tepatlah apa yang dikatakan Allah melalui Nabi Yeremia,\\n\\n“Terdengar suara di kota Rama,\\n\\nyaitu suara-suara yang menangis dan ratapan karena rasa sedih yang sangat mendalam.\\n\\nRahel menangisi anak-anaknya,\\n\\ndan dia tidak mau dihibur,\\n\\nkarena anak-anaknya sudah mati.”\\n\\nSesudah Herodes meninggal, Yusuf dan keluarganya masih berada di negeri Mesir. Lalu malaikat Tuhan datang lagi kepadanya dalam mimpi.\\n\\nMalaikat itu berkata, “Bangunlah dan pergilah kembali ke tanah Israel bersama Yesus dan Maria! Karena orang-orang yang berusaha membunuh Dia sudah meninggal.”\\n\\nLalu Yusuf bangun dan langsung berangkat dengan mereka ke tanah Israel.\\n\\nTetapi waktu Yusuf mendengar bahwa yang menggantikan Raja Herodes adalah Arkelaus, dia takut kembali ke propinsi Yudea. Arkelaus adalah putra dari Raja Herodes sendiri. Lalu Yusuf mendapat petunjuk lagi melalui mimpi, sehingga dia membawa mereka ke propinsi Galilea.\\n\\nJadi mereka pergi ke kota yang bernama Nazaret dan tinggal di sana. 
Dengan demikian apa yang dikatakan Allah melalui para nabi ditepati— yaitu “Waktu Kristus datang, Dia akan disebut ‘Orang Nazaret.’”',\n", + " '../../dataset/raw/bibleis/INDWBT/audio/INDWBT_MAT_2.mp3']" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_url = \"https://live.bible.is/bible/INDWBT/MAT/2?audio_type=audio\"\n", + "scraper.scrape_page(test_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get All Base Urls" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -136,11 +224,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 76, "metadata": { "ExecuteTime": { - "end_time": "2020-01-30T08:21:00.667000Z", - "start_time": "2020-01-30T08:20:29.061000Z" + "end_time": "2020-02-27T05:29:43.720804Z", + "start_time": "2020-02-27T05:28:41.786804Z" } }, "outputs": [], @@ -148,23 +236,32 @@ "scraper.get_urls()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assert total number of base urls:\n", + "1. INDASV version = `1189` (929 old + 260 new)\n", + "2. INDWBT version = `260` (260 new only)" + ] + }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 77, "metadata": { "ExecuteTime": { - "end_time": "2020-01-30T08:21:00.887000Z", - "start_time": "2020-01-30T08:21:00.671000Z" + "end_time": "2020-02-27T05:29:53.995804Z", + "start_time": "2020-02-27T05:29:53.799804Z" } }, "outputs": [ { "data": { "text/plain": [ - "1189" + "260" ] }, - "execution_count": 14, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -180,6 +277,22 @@ "For every page, get verses and mp3s." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Continue Scraping\n", + "\n", + "Continue Scraping from saved logs." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## INDASV" + ] + }, { "cell_type": "code", "execution_count": 66, @@ -234,11 +347,35 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scraper.run(list(l_cont.values))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## INDWBT\n", + "\n", + "Currently not implemented." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scrape All" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-02-03T03:46:11.631000Z", - "start_time": "2020-02-03T03:13:23.761000Z" + "start_time": "2020-02-27T05:31:38.166Z" }, "scrolled": false }, @@ -249,33 +386,26 @@ "text": [ "Running scraper:\n", "Scrape text: True\n", - "Scrape audio: False\n" + "Scrape audio: True\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "301cea15c21a471bb76a4e6be134a4a5", + "model_id": "9fb5d2f93bb54b02ac3a97681b2679a6", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))" + "HBox(children=(FloatProgress(value=0.0, max=260.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ - "scraper.run(list(l_cont.values))" + "scraper.run()" ] }, { @@ -304,7 +434,7 @@ } ], "source": [ - "scraper.write_csv('../../dataset/raw/bibleis/bibleis_transcription_v2_add.csv')" + "scraper.write_csv()" ] }, { @@ -321,7 +451,7 @@ "outputs": [], "source": [ "import glob\n", - "mp3s = glob.glob(\"../../dataset/raw/*.mp3\")\n", + "mp3s = glob.glob(output_dir+\"audio/*.mp3\")\n", "\n", "l = [x for x in mp3s if (os.stat(x).st_size < 3e5)]\n", "smol_files = [\"https://live.bible.is/bible/\" + x[:-4].replace(\"_\", \"/\") + \"?audio_type=audio\" for x in 
l]\n", From 44a69ea2fca6ff6f4431a28c9ae0f50ad1155d98 Mon Sep 17 00:00:00 2001 From: "Gunawan Lumban Gaol (ID)" Date: Fri, 28 Feb 2020 16:13:06 +0700 Subject: [PATCH 2/5] fix css selector error --- gurih/data/scraper.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/gurih/data/scraper.py b/gurih/data/scraper.py index 85eaddb..200abb3 100644 --- a/gurih/data/scraper.py +++ b/gurih/data/scraper.py @@ -5,7 +5,6 @@ import pandas as pd import urllib from selenium import webdriver -# from selenium.common.exceptions import NoSuchElementException # from selenium.webdriver.common.by import By # from selenium.webdriver.support.ui import WebDriverWait # from selenium.webdriver.support import expected_conditions as EC @@ -253,11 +252,10 @@ def _scrape_text_indwbt(self, driver, url): chapter_string = '' # Get all verses - cv_pattern = self.__get_chapter(url) + cv = self.__get_chapter(url) chapter_section = driver.find_element_by_css_selector(".chapter") - data = chapter_section.find_elements_by_css_selector( - f"span[data-id^={cv_pattern}], div[data-id^={cv_pattern}]" - ) + css_pattern = f"p[data-id^={cv}], span[data-id^={cv}], div[data-id^={cv}]" + data = chapter_section.find_elements_by_css_selector(css_pattern) verses = [] for d in data: @@ -276,4 +274,10 @@ def __get_chapter(url): """ "https://live.bible.is/bible/INDWBT/MAT/1?audio_type=audio" --> "MAT1" """ - return re.search("[^?]*", url[35:]).group().replace("/", '') + cv_pattern = re.search("[^?]*", url[35:]).group().replace("/", '') + + # In case of starting with digit, WTF?? + if cv_pattern[0] in ['1', '2']: + cv_pattern = r'\3' + cv_pattern + + return cv_pattern From ec77265cabf37f924818305db3cf98fde586661c Mon Sep 17 00:00:00 2001 From: "Gunawan Lumban Gaol (ID)" Date: Mon, 2 Mar 2020 11:22:13 +0700 Subject: [PATCH 3/5] finish INDASV audio Need rescraping the transcription. 
--- gurih/data/scraper.py | 1 + notebooks/eda/1.0-glg-scrape-bibleis.ipynb | 476 +++++++++++++++--- .../eda/2.0-glg-preprocess-bibleis.ipynb | 46 +- 3 files changed, 442 insertions(+), 81 deletions(-) diff --git a/gurih/data/scraper.py b/gurih/data/scraper.py index 200abb3..4e4eff8 100644 --- a/gurih/data/scraper.py +++ b/gurih/data/scraper.py @@ -153,6 +153,7 @@ def scrape_page(self, url): driver.get(url) audio_title = '' + chapter_string = '' if self.scrape_text: if self.version == 'INDASV': diff --git a/notebooks/eda/1.0-glg-scrape-bibleis.ipynb b/notebooks/eda/1.0-glg-scrape-bibleis.ipynb index 9695846..0bb9a11 100644 --- a/notebooks/eda/1.0-glg-scrape-bibleis.ipynb +++ b/notebooks/eda/1.0-glg-scrape-bibleis.ipynb @@ -22,28 +22,43 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-02-27T05:20:17.247804Z", - "start_time": "2020-02-27T05:20:16.989804Z" + "end_time": "2020-02-29T12:31:26.936500Z", + "start_time": "2020-02-29T12:31:26.931500Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import glob\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from gurih.data.scraper import BibleIsScraper" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-29T12:31:26.971500Z", + "start_time": "2020-02-29T12:31:26.946500Z" } - ], + }, + "outputs": [], "source": [ - "from gurih.data.scraper import BibleIsScraper\n", + "def urls_to_mp3s(urls):\n", + " return [re.search(\"[^?]*\", url[28:]).group().replace('/', '_') + \".mp3\" for url in urls]\n", "\n", - "%load_ext autoreload\n", - "%autoreload 2" + "def mp3s_to_urls(mp3s):\n", + " base_url = 'https://live.bible.is/bible/'\n", + " tail_url = \"?audio_type=audio\"\n", + " return [base_url+mp3[:-4].replace('_', '/')+tail_url for mp3 in mp3s]" ] }, { @@ -55,11 +70,11 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-02-27T05:20:18.400804Z", - "start_time": "2020-02-27T05:20:18.203804Z" + "end_time": "2020-02-29T12:31:29.114000Z", + "start_time": "2020-02-29T12:31:29.104000Z" } }, "outputs": [], @@ -69,17 +84,17 @@ "# output_dir = \"../../dataset/raw/bibleis/INDASV/\"\n", "\n", "# INDWBT version\n", - "base_url = \"https://live.bible.is/bible/INDWBT/MRK/1?audio_type=audio\"\n", + "base_url = \"https://live.bible.is/bible/INDWBT/MAT/1?audio_type=audio\"\n", "output_dir = \"../../dataset/raw/bibleis/INDWBT/\"" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-02-27T05:20:18.618804Z", - "start_time": "2020-02-27T05:20:18.405804Z" + "end_time": "2020-02-29T12:31:29.739000Z", + "start_time": "2020-02-29T12:31:29.706500Z" } }, "outputs": [ @@ -108,17 +123,17 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 6, "metadata": { 
"ExecuteTime": { - "end_time": "2020-02-27T05:20:19.496804Z", - "start_time": "2020-02-27T05:20:19.312804Z" + "end_time": "2020-02-29T12:31:32.127750Z", + "start_time": "2020-02-29T12:31:32.111500Z" } }, "outputs": [], "source": [ "# scraper.scrape_audio = False\n", - "# scarper.scrape_text = False" + "scraper.scrape_text = False" ] }, { @@ -175,36 +190,51 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true + }, "source": [ "## INDWBT Version" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-02-27T05:21:52.438804Z", - "start_time": "2020-02-27T05:20:31.610804Z" - } + "end_time": "2020-02-28T07:57:19.357250Z", + "start_time": "2020-02-28T07:55:22.608250Z" + }, + "hidden": true }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "span[data-id^=\\31JN1], div[data-id^=\\31JN1]\n", + "span[data-id^=\\31JN1], div[data-id^=\\31JN1]\n", + "'span[data-id^=\\\\31JN1], div[data-id^=\\\\31JN1]'\n" + ] + }, { "data": { "text/plain": [ - "['https://live.bible.is/bible/INDWBT/MAT/2?audio_type=audio',\n", - " 'Yesus lahir di kota Betlehem di propinsi Yudea. Pada waktu itu Herodeslah yang memerintah sebagai raja Yudea di bawah pemerintahan kerajaan besar Roma. Kemudian datanglah beberapa orang peramal bintang dari negeri yang jauh di sebelah timur Yerusalem.\\n\\nMereka bertanya-tanya, “Di manakah Anak yang baru lahir itu— yaitu dia yang akan menjadi Raja orang Yahudi? Karena kami sudah melihat satu bintang muncul di sebelah timur yang adalah tanda kelahiran-Nya. Jadi kami pun datang untuk menyembah Dia.”\\n\\nKetika Raja Herodes mendengar berita tentang seorang raja orang Yahudi yang baru lahir itu, dia menjadi marah karena merasa tersaingi. 
Dan seluruh penduduk Yerusalem pun menjadi tidak tenang.\\n\\nLalu Herodes memanggil para imam kepala dan ahli Taurat dan bertanya kepada mereka di mana Kristus akan lahir.\\n\\nJawab mereka, “Di kota Betlehem, Yudea. Karena nabi yang sudah menuliskan perkataan Allah seperti ini,\\n\\n‘Hai penduduk Betlehem, di daerah Yehuda,\\n\\nsekarang kalian tidak lagi dianggap kota yang tidak penting di daerah itu!\\n\\nKarena di tengah-tengah kalian akan dilahirkan seorang pemimpin\\n\\nyang akan memimpin Israel, umat-Ku.’”\\n\\nKemudian Herodes mengadakan pertemuan tertutup bersama para peramal bintang itu. Dengan demikian dia mendapatkan keterangan yang tepat dari mereka tentang kapan bintang itu muncul pertama kali.\\n\\nKemudian dia menyuruh mereka ke Betlehem dan berkata, “Pergi dan carilah keterangan yang pasti tentang Anak itu. Dan setelah kalian menemukan Dia, beritahukanlah kepadaku, supaya saya pun datang menyembah-Nya.”\\n\\nSesudah pertemuan itu, para peramal bintang itu pun pergi. Saat mereka melihat lagi bintang itu sedang naik di sebelah timur, mereka sangat bersukacita. Lalu cahaya bintang itu mengarahkan mereka dan berhenti di atas rumah di mana Anak itu berada.\\n\\nLalu masuklah mereka ke dalam rumah, tempat di mana Anak itu berada. Mereka melihat Anak itu bersama Maria, ibu-Nya, lalu berlutut dan menyembah Anak itu. Mereka membuka kantong-kantongnya dan mempersembahkan berbagai hadiah kepada Anak itu— yaitu emas, kemenyan, dan mur.\\n\\nTetapi tidak lama kemudian, Allah memberitahu mereka melalui mimpi supaya tidak kembali kepada Herodes. Karena itu mereka pulang ke negeri mereka melalui jalan yang lain.\\n\\nSesudah para peramal bintang itu pergi, malaikat Tuhan datang kepada Yusuf dalam mimpi. Malaikat itu berkata, “Bangunlah! Segeralah bawa Anak itu dan ibu-Nya ke Mesir. Tinggallah di sana sampai aku datang lagi membawa pesan kepadamu untuk kembali. 
Karena Herodes berencana mencari Anak itu untuk membunuh Dia.”\\n\\nPada malam itu juga Yusuf bangun dan pergi menuju Mesir bersama Maria dan Yesus.\\n\\nMereka tinggal di Mesir sampai Herodes meninggal. Dengan demikian tepatlah apa yang dikatakan Allah melalui nabi-Nya, “Aku akan memanggil Anak-Ku keluar dari Mesir.”\\n\\nKetika Herodes tahu bahwa para peramal bintang itu sudah menipunya, dia menjadi sangat marah. Lalu dia memerintahkan tentara-tentaranya untuk membunuh semua anak laki-laki di kota Betlehem dan semua daerah di sekitarnya— yaitu anak-anak yang berumur dua tahun ke bawah, sesuai dengan keterangan dari para peramal bintang itu.\\n\\nLalu tepatlah apa yang dikatakan Allah melalui Nabi Yeremia,\\n\\n“Terdengar suara di kota Rama,\\n\\nyaitu suara-suara yang menangis dan ratapan karena rasa sedih yang sangat mendalam.\\n\\nRahel menangisi anak-anaknya,\\n\\ndan dia tidak mau dihibur,\\n\\nkarena anak-anaknya sudah mati.”\\n\\nSesudah Herodes meninggal, Yusuf dan keluarganya masih berada di negeri Mesir. Lalu malaikat Tuhan datang lagi kepadanya dalam mimpi.\\n\\nMalaikat itu berkata, “Bangunlah dan pergilah kembali ke tanah Israel bersama Yesus dan Maria! Karena orang-orang yang berusaha membunuh Dia sudah meninggal.”\\n\\nLalu Yusuf bangun dan langsung berangkat dengan mereka ke tanah Israel.\\n\\nTetapi waktu Yusuf mendengar bahwa yang menggantikan Raja Herodes adalah Arkelaus, dia takut kembali ke propinsi Yudea. Arkelaus adalah putra dari Raja Herodes sendiri. Lalu Yusuf mendapat petunjuk lagi melalui mimpi, sehingga dia membawa mereka ke propinsi Galilea.\\n\\nJadi mereka pergi ke kota yang bernama Nazaret dan tinggal di sana. 
Dengan demikian apa yang dikatakan Allah melalui para nabi ditepati— yaitu “Waktu Kristus datang, Dia akan disebut ‘Orang Nazaret.’”',\n", - " '../../dataset/raw/bibleis/INDWBT/audio/INDWBT_MAT_2.mp3']" + "['https://live.bible.is/bible/INDWBT/1JN/1?audio_type=audio',\n", + " 'Kami ingin memberitakan kepada kalian tentang Dia yang disebut Firman— yaitu Dia yang memberikan hidup kepada kita dan yang sudah ada sebelum dunia diciptakan. Kami sudah mendengar dan melihat Dia dengan mata kami sendiri. Dan sungguh, kami sudah melihat Dia— bahkan kami sudah memegang Dia dengan tangan kami sendiri.\\n\\nBenar, Dia yang memberikan hidup itu sudah dinyatakan kepada kami, dan kami mendapat banyak kesempatan untuk melihat Dia. Sekarang kami bersaksi dan memberitakan kepada kalian tentang Dia yang memberi hidup yang selama-lamanya— yaitu Dia yang dari sejak semula sudah tinggal bersama Allah Bapa dan yang sudah dinyatakan oleh Bapa kepada kami.\\n\\nJadi, apa yang sudah kami lihat dan dengar, itulah yang kami beritakan kepada kalian, supaya kalian berhubungan dekat dengan kami dalam persekutuan kita. 
Dan bukan saja kita saling berhubungan, tetapi kita berhubungan dekat juga dengan Bapa dan Anak-Nya— yaitu Kristus Yesus.\\n\\nOleh karena itu, kami menulis surat ini kepada kalian, supaya melalui hubungan yang dekat itu sukacita kita semua akan sempurna.\\n\\nDan inilah berita yang sudah kami dengar dari Anak Allah dan yang sedang kami beritakan kepada kalian: Allah itu seperti terang, dan sama sekali tidak ada kegelapan di dalam Dia.\\n\\nJadi kalau kita berkata bahwa kita dengan Allah mempunyai hubungan yang erat, padahal kita masih hidup seperti orang yang tinggal di dalam kegelapan, berarti kita berbohong dan tidak menjalankan ajaran benar dari Allah.\\n\\nTetapi, kalau kita hidup di dalam terang sama seperti Allah hidup di dalam terang, kita dengan saudara-saudari seiman kita akan tetap mempunyai hubungan yang erat, dan darah Yesus— yang adalah Anak Allah, tetap menyucikan kita dari setiap dosa kita.\\n\\nKalau ada saudara kita yang berkata, “Saya tidak berdosa,” berarti dia tidak menerima ajaran yang benar dari Allah dan dia sudah menipu dirinya sendiri.\\n\\nTetapi kalau kita mengakui dosa-dosa kita kepada Allah, maka sesuai dengan janji-Nya, Allah yang sangat adil dan setia itu pasti mengampuni kita dan membersihkan hati kita dari setiap perbuatan jahat yang sudah kita lakukan.\\n\\nKalau ada saudara kita yang berkata, “Saya tidak pernah berbuat dosa,” berarti sama saja dia menganggap Allah sebagai Pembohong, dan ternyata ajaran yang benar dari Allah tidak mendapat tempat di dalam hatinya.',\n", + " '../../dataset/raw/bibleis/INDWBT/audio/INDWBT_1JN_1.mp3']" ] }, - "execution_count": 75, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "test_url = \"https://live.bible.is/bible/INDWBT/MAT/2?audio_type=audio\"\n", + "# test_url = \"https://live.bible.is/bible/INDWBT/MAT/2?audio_type=audio\"\n", + "# test_url = \"https://live.bible.is/bible/INDWBT/1CO/10?audio_type=audio\"\n", + "# test_url = 
\"https://live.bible.is/bible/INDWBT/1CO/11?audio_type=audio\"\n", + "test_url = \"https://live.bible.is/bible/INDWBT/1JN/1?audio_type=audio\"\n", "scraper.scrape_page(test_url)" ] }, @@ -224,11 +254,11 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-02-27T05:29:43.720804Z", - "start_time": "2020-02-27T05:28:41.786804Z" + "end_time": "2020-02-29T12:32:41.222500Z", + "start_time": "2020-02-29T12:31:36.349000Z" } }, "outputs": [], @@ -247,11 +277,11 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-02-27T05:29:53.995804Z", - "start_time": "2020-02-27T05:29:53.799804Z" + "end_time": "2020-02-29T14:31:38.707000Z", + "start_time": "2020-02-29T14:31:38.698250Z" } }, "outputs": [ @@ -261,7 +291,7 @@ "260" ] }, - "execution_count": 77, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -270,13 +300,6 @@ "len(scraper.urls)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For every page, get verses and mp3s." 
- ] - }, { "cell_type": "markdown", "metadata": {}, @@ -288,7 +311,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true + }, "source": [ "## INDASV" ] @@ -300,7 +325,8 @@ "ExecuteTime": { "end_time": "2020-02-03T03:08:04.210500Z", "start_time": "2020-02-03T03:08:03.903500Z" - } + }, + "hidden": true }, "outputs": [ { @@ -315,7 +341,7 @@ } ], "source": [ - "df_res_continue = pd.read_csv(\"../../dataset/raw/bibleis/bibleis_transcription_v2.csv\")\n", + "df_res_continue = pd.read_csv(\"../../dataset/raw/bibleis/INDASV/transcription.csv\")\n", "df_res_continue.shape" ] }, @@ -326,7 +352,8 @@ "ExecuteTime": { "end_time": "2020-02-03T03:13:16.671000Z", "start_time": "2020-02-03T03:13:16.513500Z" - } + }, + "hidden": true }, "outputs": [ { @@ -348,7 +375,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "hidden": true + }, "outputs": [], "source": [ "scraper.run(list(l_cont.values))" @@ -358,9 +387,145 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## INDWBT\n", - "\n", - "Currently not implemented." 
+ "## INDWBT" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-28T09:08:33.556171Z", + "start_time": "2020-02-28T09:08:33.504166Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(198, 3)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_res_continue = pd.read_csv(\"../../dataset/raw/bibleis/INDWBT/transcription.csv\")\n", + "df_res_continue.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-28T09:08:37.412557Z", + "start_time": "2020-02-28T09:08:37.402556Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "62" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l_cont = list(set(scraper.urls) - set(df_res_continue['url']))\n", + "len(l_cont)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-28T09:10:40.480862Z", + "start_time": "2020-02-28T09:10:40.473861Z" + } + }, + "outputs": [], + "source": [ + "l_cont = sorted(l_cont)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Continue Audio only." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-29T14:31:42.053250Z", + "start_time": "2020-02-29T14:31:42.040750Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "243\n" + ] + }, + { + "data": { + "text/plain": [ + "'INDWBT_1CO_1.mp3'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mp3s = glob.glob(output_dir+'audio/*.mp3')\n", + "mp3s = [os.path.basename(s) for s in mp3s]\n", + "print(len(mp3s))\n", + "mp3s[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-29T14:31:44.045750Z", + "start_time": "2020-02-29T14:31:44.037000Z" + } + }, + "outputs": [], + "source": [ + "# Get missing audio files\n", + "missing_audios = list(set(urls_to_mp3s(scraper.urls)) - set(mp3s))\n", + "missing_audios_urls = mp3s_to_urls(missing_audios)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-29T14:31:44.607000Z", + "start_time": "2020-02-29T14:31:44.599500Z" + } + }, + "outputs": [], + "source": [ + "assert (len(missing_audios_urls) + len(mp3s)) == len(scraper.urls)" ] }, { @@ -372,10 +537,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "ExecuteTime": { - "start_time": "2020-02-27T05:31:38.166Z" + "end_time": "2020-02-29T14:38:27.453250Z", + "start_time": "2020-02-29T14:31:55.462000Z" }, "scrolled": false }, @@ -385,27 +551,34 @@ "output_type": "stream", "text": [ "Running scraper:\n", - "Scrape text: True\n", + "Scrape text: False\n", "Scrape audio: True\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9fb5d2f93bb54b02ac3a97681b2679a6", + "model_id": "c578566d147b419ba7a272fb728cbd24", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, 
max=260.0), HTML(value='')))" + "HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [ - "scraper.run()" + "scraper.run(missing_audios_urls)" ] }, { @@ -417,11 +590,11 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-02-03T03:47:05.598500Z", - "start_time": "2020-02-03T03:47:05.361000Z" + "end_time": "2020-02-28T08:40:59.683596Z", + "start_time": "2020-02-28T08:40:59.655496Z" } }, "outputs": [ @@ -429,38 +602,187 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data written in ../../dataset/raw/bibleis/bibleis_transcription_v2_add.csv\n" + "Data written in ../../dataset/raw/bibleis/INDWBT/transcription_add.csv\n" ] } ], "source": [ - "scraper.write_csv()" + "scraper.write_csv(\"../../dataset/raw/bibleis/INDWBT/transcription_add.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If needed, rescrape broken audio files." + "If rescraping, run this cell." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-28T08:43:10.759778Z", + "start_time": "2020-02-28T08:43:10.709598Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(198, 3)\n", + "(33, 3)\n" + ] + } + ], + "source": [ + "df_old = pd.read_csv(\"../../dataset/raw/bibleis/INDWBT/transcription.csv\")\n", + "df_old = df_old.drop(df_old.columns[0], axis=1)\n", + "df_new = pd.read_csv(\"../../dataset/raw/bibleis/INDWBT/transcription_add.csv\")\n", + "print(df_old.shape)\n", + "print(df_new.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-28T08:43:15.457630Z", + "start_time": "2020-02-28T08:43:15.348237Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(231, 3)\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mtest_df_join\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'../../dataset/raw/bibleis/INDWBT/transcription_join.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[1;32massert\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtest_df_join\u001b[0m \u001b[1;33m==\u001b[0m 
\u001b[0mdf_join\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mtest_df_join\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0mtest_df_join\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'../../dataset/raw/bibleis/INDWBT/transcription.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "df_join = pd.concat([df_old, df_new]).sort_values(by=['url']).reset_index(drop=True)\n", + "print(df_join.shape)\n", + "\n", + "df_join.to_csv('../../dataset/raw/bibleis/INDWBT/transcription_join.csv', sep=',', line_terminator='\\n', index=False)\n", + "\n", + "# Test\n", + "test_df_join = pd.read_csv('../../dataset/raw/bibleis/INDWBT/transcription_join.csv')\n", + "\n", + "assert int((test_df_join == df_join).sum().mean()) == test_df_join.shape[0]\n", + "\n", + "test_df_join.to_csv('../../dataset/raw/bibleis/INDWBT/transcription.csv')\n", + "\n", + "os.remove(\"../../dataset/raw/bibleis/INDWBT/transcription_join.csv\")\n", + "os.remove(\"../../dataset/raw/bibleis/INDWBT/transcription_add.csv\")" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], + "source": [ + "If needed, rescrape broken audio files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-29T16:41:40.286750Z", + "start_time": "2020-02-29T16:41:40.256750Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "[]\n" + ] + } + ], "source": [ "import glob\n", - "mp3s = glob.glob(output_dir+\"audio/*.mp3\")\n", "\n", - "l = [x for x in mp3s if (os.stat(x).st_size < 3e5)]\n", - "smol_files = [\"https://live.bible.is/bible/\" + x[:-4].replace(\"_\", \"/\") + \"?audio_type=audio\" for x in l]\n", + "while True:\n", + " mp3s = glob.glob(output_dir+\"audio/*.mp3\")\n", + " mp3s = [x for x in mp3s if (os.stat(x).st_size < 3e5)]\n", + " mp3s = [os.path.basename(s) for s in mp3s]\n", + " smol_files = mp3s_to_urls(mp3s)\n", + " if len(smol_files == 0):\n", + " print(\"Downloaded all audio.\")\n", + " break\n", "\n", "print(len(smol_files))\n", - "print(smol_files[0])\n", - "\n", + "print(smol_files)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-02-29T16:40:09.513750Z", + "start_time": "2020-02-29T16:39:32.718750Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running scraper:\n", + "Scrape text: False\n", + "Scrape audio: True\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "92afed5b97c54a16be9caca2589cdf48", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ "scraper.run(smol_files)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " " + ] } ], "metadata": { diff --git a/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb b/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb index 
be2cf8e..dfda832 100644 --- a/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb +++ b/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb @@ -44,9 +44,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Preprocess Transcription for Alignment\n", + "# Preprocess Transcription for Alignment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## INDASV\n", "\n", - "Preprocess each chapter transcription by:\n", + "Preprocess each chapter transcription according to these steps:\n", "1. Splitting each sentence divided by '.'\n", "2. Removing any character except `\"a-z\"`, `\".\"`, `\",\"`, `\"\"`,\n", "3. Write each of chapter verse to a `.txt` file." @@ -220,13 +227,42 @@ "# f.writelines(x[1])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## INDWBT\n", + "\n", + "Preprocess each chapter transcription according to these steps:\n", + "1. For the opening chapter of each book (e.g. MAT1, MRK1), append an additional, book-specific spoken header to the transcription, following what is usually practiced in church (e.g. 1CO_1 --> *'Surat Rasul Paulus yang pertama kepada jemaat di Korintus pasal satu'*).\n", + "2. For the rest, insert at the beginning a sentence announcing the book and chapter (e.g. MAT1 --> *'Matius pasal satu'*).\n", + "3. (Optional) Do additional splitting (e.g. `smart split`, `split by comma`). The default is `split by verse`.\n", + "4. Remove any character except `\"a-z\"`, `\".\"`, `\",\"`, `\"\"`.\n", + "5. Write the transcription in a new `.txt` format compatible with the `Aeneas` plain text input format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dict_add = {\n", + " \"1CO_1\": \"\",\n", + " \"1JN_1\": \"\",\n", + " \"1PE_1\"\n", + "}" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocess Audio & Text After Alignment\n", "\n", - "Given aligned `.json` from aeneas output, split each audio sentence into its own `.mp3` and `.txt` files." + "Given aligned `.json` from aeneas output, split each audio sentence into its own `.mp3` and `.txt` files.\n", + "\n", + "\\*\\***NOTE**\\*\\*: This notebook use single thread to do the split. Consult `2.0-glg-split_mp.py` for multiprocess approach." ] }, { @@ -5035,7 +5071,9 @@ "source": [ "# Extract Audio Features\n", "\n", - "Given splitted `.mp3` files, extract the features and write in `.npz` format." + "Given splitted `.mp3` files, extract the features and write in `.npz` format. \n", + "\n", + "\\*\\***NOTE**\\*\\*: This notebook use single thread to do the split. Consult `hr-extraction_pipeline_mp.py` for multiprocess approach." ] }, { From c2ef500efd9db5cd2a6cf49aaeac121ba6b8a822 Mon Sep 17 00:00:00 2001 From: "Gunawan Lumban Gaol (ID)" Date: Wed, 4 Mar 2020 17:05:30 +0700 Subject: [PATCH 4/5] finish INDBWT --- gurih/data/scraper.py | 4 +- notebooks/eda/1.0-glg-scrape-bibleis.ipynb | 221 ++++++---- .../eda/2.0-glg-preprocess-bibleis.ipynb | 408 +++++++++++++++++- 3 files changed, 534 insertions(+), 99 deletions(-) diff --git a/gurih/data/scraper.py b/gurih/data/scraper.py index 4e4eff8..5301bba 100644 --- a/gurih/data/scraper.py +++ b/gurih/data/scraper.py @@ -278,7 +278,7 @@ def __get_chapter(url): cv_pattern = re.search("[^?]*", url[35:]).group().replace("/", '') # In case of starting with digit, WTF?? 
- if cv_pattern[0] in ['1', '2']: - cv_pattern = r'\3' + cv_pattern + if cv_pattern[0] in ['1', '2', '3']: + cv_pattern = r'\3' + cv_pattern[0] + ' ' + cv_pattern[1:] return cv_pattern diff --git a/notebooks/eda/1.0-glg-scrape-bibleis.ipynb b/notebooks/eda/1.0-glg-scrape-bibleis.ipynb index 0bb9a11..6149ca2 100644 --- a/notebooks/eda/1.0-glg-scrape-bibleis.ipynb +++ b/notebooks/eda/1.0-glg-scrape-bibleis.ipynb @@ -22,11 +22,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T12:31:26.936500Z", - "start_time": "2020-02-29T12:31:26.931500Z" + "end_time": "2020-03-04T06:09:56.389097Z", + "start_time": "2020-03-04T06:09:54.828097Z" } }, "outputs": [], @@ -43,11 +43,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T12:31:26.971500Z", - "start_time": "2020-02-29T12:31:26.946500Z" + "end_time": "2020-03-04T06:09:56.408097Z", + "start_time": "2020-03-04T06:09:56.395097Z" } }, "outputs": [], @@ -70,11 +70,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T12:31:29.114000Z", - "start_time": "2020-02-29T12:31:29.104000Z" + "end_time": "2020-03-04T06:09:56.421097Z", + "start_time": "2020-03-04T06:09:56.416097Z" } }, "outputs": [], @@ -90,11 +90,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T12:31:29.739000Z", - "start_time": "2020-02-29T12:31:29.706500Z" + "end_time": "2020-03-04T06:09:56.448097Z", + "start_time": "2020-03-04T06:09:56.432097Z" } }, "outputs": [ @@ -123,17 +123,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T12:31:32.127750Z", - "start_time": "2020-02-29T12:31:32.111500Z" + "end_time": "2020-03-04T06:09:56.719097Z", + "start_time": 
"2020-03-04T06:09:56.710097Z" } }, "outputs": [], "source": [ - "# scraper.scrape_audio = False\n", - "scraper.scrape_text = False" + "scraper.scrape_audio = False\n", + "# scraper.scrape_text = False" ] }, { @@ -254,11 +254,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T12:32:41.222500Z", - "start_time": "2020-02-29T12:31:36.349000Z" + "end_time": "2020-03-04T05:41:45.312097Z", + "start_time": "2020-03-04T05:41:23.562097Z" } }, "outputs": [], @@ -280,8 +280,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T14:31:38.707000Z", - "start_time": "2020-02-29T14:31:38.698250Z" + "end_time": "2020-03-04T05:42:03.995097Z", + "start_time": "2020-03-04T05:42:03.988097Z" } }, "outputs": [ @@ -392,21 +392,21 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-02-28T09:08:33.556171Z", - "start_time": "2020-02-28T09:08:33.504166Z" + "end_time": "2020-03-04T06:10:03.153097Z", + "start_time": "2020-03-04T06:10:03.098097Z" } }, "outputs": [ { "data": { "text/plain": [ - "(198, 3)" + "(260, 3)" ] }, - "execution_count": 41, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -416,23 +416,30 @@ "df_res_continue.shape" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Not yet scraped list." 
+ ] + }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-02-28T09:08:37.412557Z", - "start_time": "2020-02-28T09:08:37.402556Z" + "end_time": "2020-03-04T06:09:59.973097Z", + "start_time": "2020-03-04T06:09:59.958097Z" } }, "outputs": [ { "data": { "text/plain": [ - "62" + "0" ] }, - "execution_count": 42, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -444,11 +451,11 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-02-28T09:10:40.480862Z", - "start_time": "2020-02-28T09:10:40.473861Z" + "end_time": "2020-03-04T06:10:00.595097Z", + "start_time": "2020-03-04T06:10:00.589097Z" } }, "outputs": [], @@ -465,11 +472,11 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T14:31:42.053250Z", - "start_time": "2020-02-29T14:31:42.040750Z" + "end_time": "2020-03-04T05:42:11.429097Z", + "start_time": "2020-03-04T05:42:11.409097Z" } }, "outputs": [ @@ -477,7 +484,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "243\n" + "260\n" ] }, { @@ -486,7 +493,7 @@ "'INDWBT_1CO_1.mp3'" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -500,11 +507,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T14:31:44.045750Z", - "start_time": "2020-02-29T14:31:44.037000Z" + "end_time": "2020-03-04T05:42:13.002097Z", + "start_time": "2020-03-04T05:42:12.990097Z" } }, "outputs": [], @@ -516,11 +523,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T14:31:44.607000Z", - "start_time": "2020-02-29T14:31:44.599500Z" + "end_time": "2020-03-04T05:42:13.568097Z", + "start_time": 
"2020-03-04T05:42:13.561097Z" } }, "outputs": [], @@ -528,6 +535,40 @@ "assert (len(missing_audios_urls) + len(mp3s)) == len(scraper.urls)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Continue broken chapter string." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T06:10:05.137097Z", + "start_time": "2020-03-04T06:10:05.120097Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "29" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "null_chapter_string_mask = df_res_continue['chapter_string'].isnull()\n", + "l_cont = df_res_continue[null_chapter_string_mask]['url'].to_list()\n", + "len(l_cont)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -537,11 +578,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-02-29T14:38:27.453250Z", - "start_time": "2020-02-29T14:31:55.462000Z" + "end_time": "2020-03-04T06:18:46.120097Z", + "start_time": "2020-03-04T06:10:06.605097Z" }, "scrolled": false }, @@ -551,19 +592,19 @@ "output_type": "stream", "text": [ "Running scraper:\n", - "Scrape text: False\n", - "Scrape audio: True\n" + "Scrape text: True\n", + "Scrape audio: False\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c578566d147b419ba7a272fb728cbd24", + "model_id": "51df4989666848e9aa0f71b179223b2d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))" + "HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))" ] }, "metadata": {}, @@ -578,7 +619,8 @@ } ], "source": [ - "scraper.run(missing_audios_urls)" + "# scraper.run()\n", + "scraper.run(l_cont)" ] }, { @@ -610,6 +652,20 @@ "scraper.write_csv(\"../../dataset/raw/bibleis/INDWBT/transcription_add.csv\")" ] }, + { + "cell_type": "code", + 
"execution_count": 73, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T06:47:32.366097Z", + "start_time": "2020-03-04T06:47:32.326097Z" + } + }, + "outputs": [], + "source": [ + "test = pd.read_csv(\"../../dataset/raw/bibleis/INDWBT/transcription_2.csv\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -619,26 +675,16 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 133, "metadata": { "ExecuteTime": { - "end_time": "2020-02-28T08:43:10.759778Z", - "start_time": "2020-02-28T08:43:10.709598Z" + "end_time": "2020-03-04T06:58:43.564097Z", + "start_time": "2020-03-04T06:58:43.555097Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(198, 3)\n", - "(33, 3)\n" - ] - } - ], + "outputs": [], "source": [ "df_old = pd.read_csv(\"../../dataset/raw/bibleis/INDWBT/transcription.csv\")\n", - "df_old = df_old.drop(df_old.columns[0], axis=1)\n", "df_new = pd.read_csv(\"../../dataset/raw/bibleis/INDWBT/transcription_add.csv\")\n", "print(df_old.shape)\n", "print(df_new.shape)" @@ -646,11 +692,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 44, "metadata": { "ExecuteTime": { - "end_time": "2020-02-28T08:43:15.457630Z", - "start_time": "2020-02-28T08:43:15.348237Z" + "end_time": "2020-03-04T05:48:04.713267Z", + "start_time": "2020-03-04T05:48:04.590255Z" } }, "outputs": [ @@ -658,33 +704,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "(231, 3)\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mtest_df_join\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'../../dataset/raw/bibleis/INDWBT/transcription_join.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[1;32massert\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtest_df_join\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mdf_join\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mtest_df_join\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0mtest_df_join\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'../../dataset/raw/bibleis/INDWBT/transcription.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mAssertionError\u001b[0m: " + "(260, 3)\n" ] } ], "source": [ "df_join = pd.concat([df_old, df_new]).sort_values(by=['url']).reset_index(drop=True)\n", "print(df_join.shape)\n", - "\n", "df_join.to_csv('../../dataset/raw/bibleis/INDWBT/transcription_join.csv', sep=',', line_terminator='\\n', index=False)\n", "\n", "# Test\n", "test_df_join = pd.read_csv('../../dataset/raw/bibleis/INDWBT/transcription_join.csv')\n", - "\n", - "assert int((test_df_join == df_join).sum().mean()) == test_df_join.shape[0]\n", - "\n", - "test_df_join.to_csv('../../dataset/raw/bibleis/INDWBT/transcription.csv')\n", + "# assert int((test_df_join == df_join).sum().mean()) == test_df_join.shape[0]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 51, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T05:48:54.985294Z", + "start_time": "2020-03-04T05:48:54.979294Z" + } + }, + "outputs": [], + "source": [ + "test_df_join.to_csv('../../dataset/raw/bibleis/INDWBT/transcription.csv', sep=',', line_terminator='\\n', index=False)\n", "\n", "os.remove(\"../../dataset/raw/bibleis/INDWBT/transcription_join.csv\")\n", "os.remove(\"../../dataset/raw/bibleis/INDWBT/transcription_add.csv\")" diff --git a/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb b/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb index dfda832..03f9096 100644 --- a/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb +++ b/notebooks/eda/2.0-glg-preprocess-bibleis.ipynb @@ -22,11 +22,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-02-14T06:31:46.519000Z", - "start_time": "2020-02-14T06:31:45.509000Z" + "end_time": "2020-03-04T07:00:08.660097Z", + "start_time": "2020-03-04T07:00:07.527097Z" } }, "outputs": [], @@ -243,17 +243,407 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 89, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T08:08:24.553034Z", + "start_time": "2020-03-04T08:08:24.531032Z" + } + }, "outputs": [], "source": [ - "dict_add = {\n", - " \"1CO_1\": \"\",\n", - " \"1JN_1\": \"\",\n", - " \"1PE_1\"\n", + "import re\n", + "\n", + "def spell_number(number):\n", + "# number = str(number)\n", + "# number = number.replace('.', '')\n", + " \n", + " base = ['nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh', 'delapan', 'sembilan']\n", + " numeric = [1000000000000000, 1000000000000, 1000000000000, 1000000000, 1000000, 1000, 100, 10, 1]\n", + " unit = ['kuadriliun', 'triliun', 'biliun', 'milyar', 'juta', 'ribu', 'ratus', 'puluh', '']\n", + " out = ''\n", + " \n", + " i = 0\n", + " \n", + " if number == 0:\n", + " out = 'nol'\n", + " else:\n", + " while 
(number != 0):\n", + " count = int(number / numeric[i])\n", + " \n", + " if (count >= 10):\n", + " out += str(count) + ' ' + unit[i] + ' '\n", + " elif (count > 0 and count < 10):\n", + " out += base[count] + ' ' + unit[i] + ' '\n", + " \n", + " number -= numeric[i] * count\n", + " i += 1\n", + " \n", + "# print(out)\n", + " out = re.sub('satu puluh (\\w+)', r'\\1 belas', out)\n", + "# print(out)\n", + " out = re.sub('satu (ribu|ratus|puluh|belas)', r'se\\1', out)\n", + "# print(out)\n", + " out = re.sub('\\s{2,}', ' ', out.strip())\n", + " \n", + " return out" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T08:11:29.833097Z", + "start_time": "2020-03-04T08:11:29.819097Z" + } + }, + "outputs": [], + "source": [ + "dict_code_ind = {\n", + " \"1CO\": \"Satu Korintus\",\n", + " \"2CO\": \"Dua Korintus\",\n", + " \"1JN\": \"Satu Yohanes\",\n", + " \"2JN\": \"Dua Yohanes\",\n", + " \"3JN\": \"Tiga Yohanes\",\n", + " \"1PE\": \"Satu Petrus\",\n", + " \"2PE\": \"Dua Petrus\",\n", + " \"1TH\": \"Satu Tesalonika\",\n", + " \"2TH\": \"Dua Tesalonika\",\n", + " \"1TI\": \"Satu Timotius\",\n", + " \"2TI\": \"Dua Timotius\",\n", + " \"ACT\": \"Kisah\",\n", + " \"COL\": \"Kolose\",\n", + " \"EPH\": \"Efesus\",\n", + " \"GAL\": \"Galatia\",\n", + " \"HEB\": \"Ibrani\",\n", + " \"JAS\": \"Yakobus\",\n", + " \"JHN\": \"Yohanes\",\n", + " \"JUD\": \"Yudas\",\n", + " \"LUK\": \"Lukas\",\n", + " \"MAT\": \"Matius\",\n", + " \"MRK\": \"Markus\",\n", + " \"PHM\": \"Filemon\",\n", + " \"PHP\": \"Filipi\",\n", + " \"REV\": \"Wahyu\",\n", + " \"ROM\": \"Roma\",\n", + " \"TIT\": \"Titus\",\n", "}" ] }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T08:11:29.926097Z", + "start_time": "2020-03-04T08:11:29.918097Z" + } + }, + "outputs": [], + "source": [ + "dict_header = {\n", + " \"1CO\": \"Surat Paulus yang pertama kepada jemaat Korintus\",\n", + " 
\"2CO\": \"Surat Paulus yang kedua kepada jemaat Korintus\",\n", + " \"1JN\": \"Surat Yohanes yang pertama\",\n", + " \"2JN\": \"Surat Yohanes yang kedua\",\n", + " \"3JN\": \"Surat Yohanes yang ketiga\",\n", + " \"1PE\": \"Surat Petrus yang pertama\",\n", + " \"2PE\": \"Surat Petrus yang kedua\",\n", + " \"1TH\": \"Surat Paulus yang pertama kepada jemaat Tesalonika\",\n", + " \"2TH\": \"Surat Paulus yang kedua kepada jemaat Tesalonika\",\n", + " \"1TI\": \"Surat Paulus yang pertama kepada Timotius\",\n", + " \"2TI\": \"Surat Paulus yang kedua kepada Timotius\",\n", + " \"ACT\": \"Kisah Para Rasul\",\n", + " \"COL\": \"Surat Paulus kepada jemaat Kolose\",\n", + " \"EPH\": \"Surat Paulus kepada jemaat Efesus\",\n", + " \"GAL\": \"Surat Paulus kepada jemaat-jemaat di Propinsi Galatia\",\n", + " \"HEB\": \"Surat kepada jemaat bangsa Ibrani\",\n", + " \"JAS\": \"Surat Yakobus\",\n", + " \"JHN\": \"Injil Yohanes\",\n", + " \"JUD\": \"Surat Yudas\",\n", + " \"LUK\": \"Injil Lukas\",\n", + " \"MAT\": \"Injil Matius\",\n", + " \"MRK\": \"Injil Markus\",\n", + " \"PHM\": \"Surat Paulus kepada Bapak Filemon\",\n", + " \"PHP\": \"Surat Paulus kepada jemaat Filipi\",\n", + " \"REV\": \"Wahyu, penglihatan Yohanes tentang peristiwa-peristiwa yang akan datang\",\n", + " \"ROM\": \"Surat Paulus kepada jemaat Roma\",\n", + " \"TIT\": \"Surat Paulus kepada Titus\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T08:15:13.869097Z", + "start_time": "2020-03-04T08:15:13.807097Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urlchapter_stringaudio_title
0https://live.bible.is/bible/INDWBT/1CO/10?audi...Saya berkata seperti itu, Saudara-saudari, kar...INDWBT_1CO_10.mp3
1https://live.bible.is/bible/INDWBT/1CO/11?audi...Jadi ikutlah teladan saya, sama seperti saya j...INDWBT_1CO_11.mp3
2https://live.bible.is/bible/INDWBT/1CO/12?audi...Sekarang, Saudara-saudari, saya mau supaya kal...INDWBT_1CO_12.mp3
3https://live.bible.is/bible/INDWBT/1CO/13?audi...Sebagai contoh, kalau saya diberikan kemampuan...INDWBT_1CO_13.mp3
4https://live.bible.is/bible/INDWBT/1CO/14?audi...Oleh karena itu, biarlah kita selalu mengutama...INDWBT_1CO_14.mp3
5https://live.bible.is/bible/INDWBT/1CO/15?audi...Dan sekarang Saudara-saudari, saya ingin mengi...INDWBT_1CO_15.mp3
6https://live.bible.is/bible/INDWBT/1CO/16?audi...Dan sekarang saya mau memberi petunjuk tentang...INDWBT_1CO_16.mp3
7https://live.bible.is/bible/INDWBT/1CO/1?audio...-\\n\\nKepada yang kekasih Saudara-saudari saya ...INDWBT_1CO_1.mp3
8https://live.bible.is/bible/INDWBT/1CO/2?audio...Demikian juga, Saudara-saudari, ketika saya pe...INDWBT_1CO_2.mp3
9https://live.bible.is/bible/INDWBT/1CO/3?audio...Dan Saudara-saudari, dulu saya tidak bisa berb...INDWBT_1CO_3.mp3
\n", + "
" + ], + "text/plain": [ + " url \\\n", + "0 https://live.bible.is/bible/INDWBT/1CO/10?audi... \n", + "1 https://live.bible.is/bible/INDWBT/1CO/11?audi... \n", + "2 https://live.bible.is/bible/INDWBT/1CO/12?audi... \n", + "3 https://live.bible.is/bible/INDWBT/1CO/13?audi... \n", + "4 https://live.bible.is/bible/INDWBT/1CO/14?audi... \n", + "5 https://live.bible.is/bible/INDWBT/1CO/15?audi... \n", + "6 https://live.bible.is/bible/INDWBT/1CO/16?audi... \n", + "7 https://live.bible.is/bible/INDWBT/1CO/1?audio... \n", + "8 https://live.bible.is/bible/INDWBT/1CO/2?audio... \n", + "9 https://live.bible.is/bible/INDWBT/1CO/3?audio... \n", + "\n", + " chapter_string audio_title \n", + "0 Saya berkata seperti itu, Saudara-saudari, kar... INDWBT_1CO_10.mp3 \n", + "1 Jadi ikutlah teladan saya, sama seperti saya j... INDWBT_1CO_11.mp3 \n", + "2 Sekarang, Saudara-saudari, saya mau supaya kal... INDWBT_1CO_12.mp3 \n", + "3 Sebagai contoh, kalau saya diberikan kemampuan... INDWBT_1CO_13.mp3 \n", + "4 Oleh karena itu, biarlah kita selalu mengutama... INDWBT_1CO_14.mp3 \n", + "5 Dan sekarang Saudara-saudari, saya ingin mengi... INDWBT_1CO_15.mp3 \n", + "6 Dan sekarang saya mau memberi petunjuk tentang... INDWBT_1CO_16.mp3 \n", + "7 -\\n\\nKepada yang kekasih Saudara-saudari saya ... INDWBT_1CO_1.mp3 \n", + "8 Demikian juga, Saudara-saudari, ketika saya pe... INDWBT_1CO_2.mp3 \n", + "9 Dan Saudara-saudari, dulu saya tidak bisa berb... 
INDWBT_1CO_3.mp3 " + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('../../dataset/raw/bibleis/INDWBT/transcription.csv')\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T08:15:14.372097Z", + "start_time": "2020-03-04T08:15:14.339097Z" + } + }, + "outputs": [], + "source": [ + "pre_additions = []\n", + "for chapter_string, audio_title in df[['chapter_string', 'audio_title']].values:\n", + " code = audio_title[7:7+3]\n", + " verse = audio_title[7+4:-4]\n", + " pre_addition = ''\n", + " if verse == '1':\n", + " pre_addition += dict_header[code] + \". \"\n", + " pre_addition += dict_code_ind[code] + \" pasal \" + spell_number(int(verse))\n", + " pre_addition += \".\\n\\n\"\n", + " pre_additions.append(pre_addition)" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T08:15:15.531097Z", + "start_time": "2020-03-04T08:15:15.502097Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urlchapter_stringaudio_title
0https://live.bible.is/bible/INDWBT/1CO/10?audi...Satu Korintus pasal sepuluh.\\n\\nSaya berkata s...INDWBT_1CO_10.mp3
1https://live.bible.is/bible/INDWBT/1CO/11?audi...Satu Korintus pasal sebelas.\\n\\nJadi ikutlah t...INDWBT_1CO_11.mp3
2https://live.bible.is/bible/INDWBT/1CO/12?audi...Satu Korintus pasal dua belas.\\n\\nSekarang, Sa...INDWBT_1CO_12.mp3
3https://live.bible.is/bible/INDWBT/1CO/13?audi...Satu Korintus pasal tiga belas.\\n\\nSebagai con...INDWBT_1CO_13.mp3
4https://live.bible.is/bible/INDWBT/1CO/14?audi...Satu Korintus pasal empat belas.\\n\\nOleh karen...INDWBT_1CO_14.mp3
\n", + "
" + ], + "text/plain": [ + " url \\\n", + "0 https://live.bible.is/bible/INDWBT/1CO/10?audi... \n", + "1 https://live.bible.is/bible/INDWBT/1CO/11?audi... \n", + "2 https://live.bible.is/bible/INDWBT/1CO/12?audi... \n", + "3 https://live.bible.is/bible/INDWBT/1CO/13?audi... \n", + "4 https://live.bible.is/bible/INDWBT/1CO/14?audi... \n", + "\n", + " chapter_string audio_title \n", + "0 Satu Korintus pasal sepuluh.\\n\\nSaya berkata s... INDWBT_1CO_10.mp3 \n", + "1 Satu Korintus pasal sebelas.\\n\\nJadi ikutlah t... INDWBT_1CO_11.mp3 \n", + "2 Satu Korintus pasal dua belas.\\n\\nSekarang, Sa... INDWBT_1CO_12.mp3 \n", + "3 Satu Korintus pasal tiga belas.\\n\\nSebagai con... INDWBT_1CO_13.mp3 \n", + "4 Satu Korintus pasal empat belas.\\n\\nOleh karen... INDWBT_1CO_14.mp3 " + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['chapter_string'] = pre_additions + df['chapter_string']\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-04T08:15:21.299097Z", + "start_time": "2020-03-04T08:15:21.217097Z" + } + }, + "outputs": [], + "source": [ + "df.to_csv('../../dataset/raw/bibleis/INDWBT/transcription.csv', index=False, sep=',', line_terminator='\\n')" + ] + }, { "cell_type": "markdown", "metadata": {}, From 15a7417e8de3e2404020201c29f36d02e503bd47 Mon Sep 17 00:00:00 2001 From: "Gunawan Lumban Gaol (ID)" Date: Thu, 12 Mar 2020 17:33:15 +0700 Subject: [PATCH 5/5] fix span issues Now both version use same algorithm to scrape. Fix by filtering getInnerHtml function to find all character until first occurence of '<'. 
--- gurih/data/scraper.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/gurih/data/scraper.py b/gurih/data/scraper.py index 5301bba..50de5ac 100644 --- a/gurih/data/scraper.py +++ b/gurih/data/scraper.py @@ -216,6 +216,30 @@ def _check_null_df(self, df): def _scrape_text_indasv(self, driver, url): chapter_string = '' + # Get all verses + cv = self.__get_chapter(url) + chapter_section = driver.find_element_by_css_selector(".chapter") + css_pattern = f"p[data-id^={cv}], span[data-id^={cv}], div[data-id^={cv}]" + data = chapter_section.find_elements_by_css_selector(css_pattern) + + verses = [] + for d in data: + d_text = d.get_attribute("innerHTML") + idx = d_text.find('<') # text before first '<'; -1 means no tag + if idx > 0: + d_text = d_text[:idx] + verses.extend([d_text]) + + chapter_string = '\n\n'.join(verses) + + # Clean with class="note" + chapter_string = re.sub('', '', chapter_string) + + return chapter_string + + def _scrape_text_indasv_old(self, driver, url): + chapter_string = '' + # Get all verses chapter_section = driver.find_element_by_css_selector(".chapter") ps = chapter_section.find_elements_by_css_selector("p") @@ -261,6 +285,9 @@ def _scrape_text_indwbt(self, driver, url): verses = [] for d in data: d_text = d.get_attribute("innerHTML") + idx = d_text.find('<') # text before first '<'; -1 means no tag + if idx > 0: + d_text = d_text[:idx] verses.extend([d_text]) chapter_string = '\n\n'.join(verses)