forked from boostcampaitech5/level3_nlp_finalproject-nlp-08
-
Notifications
You must be signed in to change notification settings - Fork 0
/
spellchecker.py
72 lines (54 loc) · 2.08 KB
/
spellchecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from tqdm import trange
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
def set_options():
    """Build the Chrome options object carrying this script's launch switches.

    Returns:
        A ``webdriver.ChromeOptions`` instance to which the four
        command-line switches below have been added, in order.
    """
    chrome_options = webdriver.ChromeOptions()
    launch_flags = (
        "--headless",
        "--no-sandbox",
        "--single-process",
        "--disable-dev-shm-usage",
    )
    for flag in launch_flags:
        chrome_options.add_argument(flag)
    return chrome_options
def wait_driver_click(id):
    """Wait until an element with the given DOM id can be found, then click it.

    Operates on the module-level ``driver``. The explicit wait polls for up
    to 60 seconds; once the lookup succeeds the element is located again on
    the driver and clicked.

    Args:
        id: Value of the ``id`` attribute of the element to click.
    """
    locator = (By.ID, id)
    waiter = WebDriverWait(driver, timeout=60)
    waiter.until(lambda d: d.find_element(*locator))
    target = driver.find_element(*locator)
    target.click()
def spell_check(txt, delay=15):
    """Submit ``txt`` to the spell-check page and return the previewed result.

    Operates on the module-level ``driver``, which is expected to already be
    showing the page that contains the ``character_counter_content`` input.
    The text is typed into that input, the ``spell_check`` and
    ``spell_done_all`` controls are clicked in turn, and the text of the
    ``#checker_preview`` element is read back.

    The page is refreshed on every exit path once typing has been attempted,
    including when a wait times out. Without that, a failed call would leave
    its text in the input box and the next call would append to it.

    Args:
        txt: Text to check. Inputs longer than 1200 characters are not
            submitted.
        delay: Seconds passed to ``driver.implicitly_wait`` after the check
            is triggered.

    Returns:
        The text of ``#checker_preview``, or ``''`` when ``txt`` is longer
        than 1200 characters.
    """
    if len(txt) > 1200:
        return ''

    try:
        # WebDriverWait.until returns whatever the callable returned, i.e.
        # the located element, so no second lookup is needed.
        input_box = WebDriverWait(driver, timeout=10).until(
            lambda d: d.find_element(By.ID, 'character_counter_content')
        )
        input_box.send_keys(txt)

        wait_driver_click('spell_check')
        # NOTE: implicitly_wait does not pause execution; it sets how long
        # subsequent element lookups keep retrying before giving up.
        driver.implicitly_wait(delay)
        wait_driver_click('spell_done_all')
        driver.implicitly_wait(1)

        preview = WebDriverWait(driver, timeout=10).until(
            lambda d: d.find_element(By.CSS_SELECTOR, '#checker_preview')
        )
        return preview.text
    finally:
        # Reset the form for the next call, whether or not this one succeeded.
        driver.refresh()
# Batch driver: spell-check every question in the input CSV through the web
# tool and write the corrected question/answer pairs to a new CSV.
dt = pd.read_csv('./data/preprocessed_qa_spacing_pre_word.csv')
# Pull the columns up front so a missing column fails before a browser starts.
q_v = dt['question'].values
a_v = dt['answer'].values
new_q = []
new_a = []
failed = 0

# NOTE(review): the options are built but the driver is started without them
# (the original had the `webdriver.Chrome(options=options)` variant commented
# out) - confirm whether a headless run is intended.
options = set_options()
driver = webdriver.Chrome()
try:
    driver.get('https://www.saramin.co.kr/zf_user/tools/character-counter')
    for i in trange(len(dt)):
        question = q_v[i]
        # Empty CSV cells come back from pandas as NaN (a float), which has
        # no len(); skip those together with over-long questions.
        if not isinstance(question, str) or len(question) > 1200:
            continue
        try:
            checked = spell_check(question)
        except Exception:
            # Best effort: drop rows the site could not process, but let
            # KeyboardInterrupt/SystemExit through so the run can be stopped.
            failed += 1
            continue
        # Append both halves together so the two lists stay aligned.
        new_q.append(checked)
        new_a.append(a_v[i])
finally:
    # Always close the browser, even when the loop is interrupted.
    driver.quit()

if failed:
    print(f'{failed} question(s) skipped because spell checking failed')

clean_data = pd.DataFrame({'question': new_q, 'answer': new_a})
clean_data.to_csv('./data/preprocessed_qa_spacing_spell.csv', index=False)