# -*- coding: utf-8 -*-
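"""Pre-reform to contemporary Russian orthography converter (pipeline driver).

Processor.process_text() preprocesses and tokenizes the input text,
transliterates each word token into contemporary spelling, then reassembles
the text, wrapping every changed word in the caller-supplied delimiters and
returning the change log plus a JSON dump of the token table.
"""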
__author__ = 'ElenaSidorova'
from copy import deepcopy
from preprocess import Preprocessor
from tokenizer import Tokenizer
from transliterator import Transliterator
from meta_data import META
import json
import subprocess  # used only by the commented-out hunspell check in join_tokens
class Processor(object):
    @classmethod
    def process_text(cls, text, show, delimiters, check_brackets, print_log=True):
        text = Preprocessor.preprocess_text(text)
        tokens = Tokenizer.tokenize(text)
        for i in tokens.keys():
            if tokens[i].type == 'word':
                word = Transliterator.transliterate(tokens[i].word, print_log)
                if word != tokens[i].word:
                    # keep the pre-reform spelling so join_tokens can report the edit
                    tokens[i].old_word = deepcopy(tokens[i].word)
                    tokens[i].word = word
        text, changes, wrong_edits = cls.join_tokens(tokens, show, delimiters, check_brackets)
        str_json = cls.to_json(tokens)
        return text, changes, wrong_edits, str_json
    @classmethod
    def to_json(cls, tokens):
        jn = {}
        for key in tokens.keys():
            jn[str(key)] = {
                'word': tokens[key].word,
                'old_word': tokens[key].old_word,
                'type': tokens[key].type,
                'plain_word': tokens[key].plain_word,
                'old_plain_word': tokens[key].old_plain_word,
            }
        str_json = json.dumps(jn)
        return str_json
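
    # The dump maps each token index (as a string) to that token's fields,
    # e.g. (shape only; the values here are illustrative):
    #   {"0": {"word": "...", "old_word": null, "type": "word",
    #          "plain_word": "...", "old_plain_word": null}}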
    @classmethod
    def join_tokens(cls, tokens, show, delimiters, check_brackets):
        text = []
        changes = []
        spelling = []
        wrong_changes = []
        for i in range(len(tokens)):
            word = tokens[i].word
            old = tokens[i].old_word
            if check_brackets:
                if u'[' in word and tokens[i].type == 'word':
                    # A word with a [bracketed] editorial conjecture: wrap the
                    # source form in the markup from META['spelling_delimiters_upd']
                    # and append the regularized form.
                    # Assumption: the replace() below normalizes straight apostrophes
                    # to typographic ones (U+2019); the replacement character is
                    # garbled in the source file.
                    new = META['spelling_delimiters_upd'][0]
                    new += (old or word).replace(u"'", u"\u2019")
                    new += META['spelling_delimiters_upd'][1]
                    outside = word.split(u'[')[0] + word.split(u']')[1]
                    plain = word.replace(u'[', u'').replace(u']', u'')
                    if old:
                        new_vers = word.split(u'[')[1].split(u']')[0]
                        old_vers = old.split(u'[')[1].split(u']')[0]
                        if new_vers == old_vers:
                            # only the text outside the brackets changed
                            new += outside + META['spelling_delimiters_upd'][2] + \
                                plain + META['spelling_delimiters_upd'][3]
                        else:
                            old_outside = old.split(u'[')[0] + old.split(u']')[1]
                            old_plain = old.replace(u'[', u'').replace(u']', u'')
                            if outside == old_outside:
                                # only the bracketed part changed
                                new += outside + META['spelling_delimiters_upd'][2] + \
                                    delimiters[0] + plain + delimiters[1] + \
                                    old_plain + delimiters[2] + \
                                    META['spelling_delimiters_upd'][3]
                            else:
                                # both the bracketed part and the rest changed
                                new += delimiters[0] + outside + delimiters[1] + \
                                    old_outside + delimiters[2] + \
                                    META['spelling_delimiters_upd'][2] + \
                                    delimiters[0] + plain + delimiters[1] + \
                                    old_plain + delimiters[2] + \
                                    META['spelling_delimiters_upd'][3]
                    else:
                        new += outside + META['spelling_delimiters_upd'][2] + \
                            plain + META['spelling_delimiters_upd'][3]
                    text.append(new)
                    if old:
                        spelling.append(plain)
                        changes.append(old + u' --> ' + word)
                elif old:
                    text.append(delimiters[0] + word + delimiters[1] +
                                old + delimiters[2])
                    spelling.append(word)
                    changes.append(old + u'\t-->\t' + word)
                else:
                    text.append(word)
            else:
                if old:
                    if show:
                        text.append(delimiters[0] + word + delimiters[1] +
                                    old + delimiters[2])
                    else:
                        text.append(word)
                    changes.append(old + u' --> ' + word)
                else:
                    text.append(word)
        # Optional spell check of the converted words (disabled):
        # if spelling:
        #     cmd = "echo " + u' '.join(spelling) + " | hunspell -d ru_RU"
        #     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        #                          shell=True, executable="/bin/bash")
        #     spelled, err_sp = p.communicate()
        #     spelled = spelled.split('\n')[1:][:-2]
        #     for j, sp in enumerate(spelled):
        #         if sp[0] == u'&':  # hunspell marks misspelled words with '&'
        #             wrong_changes.append(changes[j])
        #             changes[j] = changes[j] + u' *'
        out = u'\n'.join(changes)
        return u''.join(text), out, wrong_changes
# Manual test inputs (commented out); pre-reform Russian passages with [bracketed] editorial conjectures:
# text = u'Пройдя комнату, такъ [называемую], офиціанскую, мы взошли въ кабинетъ Папа. Онъ стоялъ подлѣ письменнаго стола и, показывая на бумаги, запечатанные конверты, кучки денегъ, горячился и что-то толковалъ прикащику Никитѣ Петрову, который на обычно[мъ] своемъ мѣстѣ, подлѣ барометра, разставивъ ноги на приличное раз[стояніе], заложивъ руки назадъ и приводя за спиною пальцы въ движеніе тѣмъ быстрѣе, чѣмъ болѣе горячился [13] папа, спереди не выказывалъ ни малѣйшаго знака безпокойства, но, напротивъ, выраженіемъ лица выказывалъ совершенное сознаніе своей правоты и вмѣстѣ съ тѣмъ подвластности.'
# text = u'df 13 fsdf'
# text = u'офиціанскую'
# text = u' обычно[мъ] '
# text = u'который [на] обычно[мъ] [своемъ] мѣстѣ, под[лѣ] баро[метра], разст[авивъ], любо[въ]'
# import codecs
# with codecs.open(u'/Users/el/Downloads/vol. 1/index.html', 'r', 'utf-8') as inf:
# text = inf.read()
# a = Processor()
# b, c, r, m = a.process_text(text, 1, [u'<choice><reg>', u'</reg><orig>', u'</orig></choice>'], 1)
# print b
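
# A minimal, runnable sketch of the pipeline (the delimiter strings below are
# illustrative placeholders, not the project's canonical markup):
if __name__ == '__main__':
    demo = u'офиціанскую'  # a single pre-reform word
    converted, log, bad_edits, token_json = Processor.process_text(
        demo, 1, [u'{', u'|', u'}'], 1)
    print converted   # converted text, changed words wrapped as {new|old}
    print log         # one "old --> new" line per change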