Skip to content

Commit

Permalink
edit cleaners
Browse files (browse the repository at this point in the history)
  • Loading branch information
kdrkdrkdr committed Jun 27, 2023
1 parent d4ed8b3 commit 84ee959
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 35 deletions.
39 changes: 18 additions & 21 deletions inference.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions text/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@ def korean_cleaners(text):
def jk_cleaners(text):
    """Clean text made of segments wrapped in language tags.

    Segments are handled in this order:

    * ``[JA]...[JA]`` is replaced by ``japanese_cleaners`` applied to the
      inner text, followed by a space.
    * ``[KO]...[KO]`` is replaced by ``korean_cleaners`` applied to the
      inner text, followed by a space.
    * ``[PREPROCESSED]...[PREPROCESSED]`` is replaced by the inner text
      unchanged, followed by a space.

    The result is then rebuilt from the pieces that ``_cleaner_cleans``
    finds in it (presumably a compiled pattern defined elsewhere in this
    module -- confirm) and returned as a single string.
    """
    def _japanese_segment(match):
        return japanese_cleaners(match.group(1)) + ' '

    def _korean_segment(match):
        return korean_cleaners(match.group(1)) + ' '

    def _preprocessed_segment(match):
        # Already-cleaned text: only the surrounding tags are dropped.
        return match.group(1) + ' '

    text = re.sub(r'\[JA\](.*?)\[JA\]', _japanese_segment, text)
    text = re.sub(r'\[KO\](.*?)\[KO\]', _korean_segment, text)
    text = re.sub(
        r'\[PREPROCESSED\](.*?)\[PREPROCESSED\]',
        _preprocessed_segment,
        text,
    )

    kept_pieces = _cleaner_cleans.findall(text)
    return ''.join(kept_pieces)
20 changes: 15 additions & 5 deletions text/j2k.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from .cleaners import japanese_to_romaji_with_accent
from .cleaners import japanese_to_romaji_with_accent, korean_cleaners
from .korean import join_jamos


repl_lst = {
'.': '. ',
'↓': '',
Expand All @@ -14,10 +13,15 @@
'Q': ' |Q ',
'N': ' |N ',
'U': 'u ',
'I': 'i ',
'A': 'a ',
'E': 'e ',
'O': 'o ',
}

repl_lst2 = {
'ʧu': '츠',
'tsu': '츠',
'zu': '즈',
'su': '스',

Expand All @@ -35,6 +39,7 @@
's': 'ㅅ',
'j': 'ㅈ',
'ʧ': 'ㅊ',
'ts': 'ㅊ',
'k': 'ㅋ',
't': 'ㅌ',
'p': 'ㅍ',
Expand Down Expand Up @@ -80,20 +85,25 @@
'|Nㅇ': 'ㅇㅇ',
'|Nㅎ': 'ㅇㅎ',

'|Q': 'ㅅ'
'|Q': 'ㅅ',
'|N': 'ㄴ',
}


def japanese2korean(text):
text = japanese_to_romaji_with_accent(text).strip()
text = japanese_to_romaji_with_accent(text).strip().replace('^', '').replace(' ', '^ ')

for k, v in repl_lst.items():
text = text.replace(k, v)
for k, v in repl_lst2.items():
text = text.replace(k, v)

text = ' '.join([i.replace('*', 'ㅇ') if i.startswith('*') else i.replace('*', '') for i in text.strip().split(' ')])

for k, v in repl_lst3.items():
text = text.replace(k, v)

return join_jamos(text.replace(' ', '_').replace(' ', ''))
text = join_jamos(text.replace(' ', ' ')).replace(' ', '').replace('^', ' ')
return f"[PREPROCESSED]{korean_cleaners(text)}[PREPROCESSED]"


15 changes: 6 additions & 9 deletions text/k2j.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
number_to_hangul,
g2pk,
)
from .cleaners import japanese_cleaners
import re
import jaconv

Expand Down Expand Up @@ -47,7 +48,8 @@ def get_word_list(text):

def korean2katakana(text):
word_lst = get_word_list(text)
text = '/' + text.replace('/', ' ').replace('|', ' ').replace('^', ' ').replace(' ', ' ').replace(' ', '^')
text = '/' + text.replace('/', ' ').replace('|', ' ').replace('^', ' ').replace(' ', ' ')
print(text)
new_lst = []

for i, s in enumerate(word_lst):
Expand All @@ -70,17 +72,12 @@ def korean2katakana(text):
new_lst.extend(dh)

kr = ''.join(new_lst)

for k, v in repl_lst.items():
kr = kr.replace(k, v)

kr2ro = japanese_to_romaji_with_accent(kr).replace('si', 'shi').replace('c', 'ts') \
.replace('ti', 'ティ').replace('tu', 'トゥ') \
.replace('di', 'ディ').replace('du', 'ドゥ')
result = jaconv.alphabet2kata(kr2ro)
result = result.replace('/', '').replace('|', 'ー').replace('^', '_')
print(result)
return result


# print(korean2katakana("안녕하세요.")) -> アンニョンーハセヨ
result = jaconv.alphabet2kata(kr2ro).replace('|', 'ー').replace('/', '').replace('^', '')
result = result if result[-1] == '.' else result + '.'
return f'[PREPROCESSED]{japanese_cleaners(result).replace(" ", "")}[PREPROCESSED]'

0 comments on commit 84ee959

Please sign in to comment.