-
Notifications
You must be signed in to change notification settings - Fork 0
/
docx2txt.py
113 lines (80 loc) · 2.98 KB
/
docx2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import re
from docx import Document
# Install python-docx package to convert MS Word documents
# pip install python-docx
def convert_zero(txt):
    """Return *txt* with Arabic-Indic zero digits replaced by ASCII "0".

    Both U+0660 (Arabic-Indic) and U+06F0 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    zero_pattern = re.compile(r'([\u0660]|[\u06F0])')
    return zero_pattern.sub("0", txt)
def convert_one(txt):
    """Return *txt* with Arabic-Indic one digits replaced by ASCII "1".

    Both U+0661 (Arabic-Indic) and U+06F1 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    one_pattern = re.compile(r'([\u0661]|[\u06F1])')
    return one_pattern.sub("1", txt)
def convert_two(txt):
    """Return *txt* with Arabic-Indic two digits replaced by ASCII "2".

    Both U+0662 (Arabic-Indic) and U+06F2 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    two_pattern = re.compile(r'([\u0662]|[\u06F2])')
    return two_pattern.sub("2", txt)
def convert_three(txt):
    """Return *txt* with Arabic-Indic three digits replaced by ASCII "3".

    Both U+0663 (Arabic-Indic) and U+06F3 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    three_pattern = re.compile(r'([\u0663]|[\u06F3])')
    return three_pattern.sub("3", txt)
def convert_four(txt):
    """Return *txt* with Arabic-Indic four digits replaced by ASCII "4".

    Both U+0664 (Arabic-Indic) and U+06F4 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    four_pattern = re.compile(r'([\u0664]|[\u06F4])')
    return four_pattern.sub("4", txt)
def convert_five(txt):
    """Return *txt* with Arabic-Indic five digits replaced by ASCII "5".

    Both U+0665 (Arabic-Indic) and U+06F5 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    five_pattern = re.compile(r'([\u0665]|[\u06F5])')
    return five_pattern.sub("5", txt)
def convert_six(txt):
    """Return *txt* with Arabic-Indic six digits replaced by ASCII "6".

    Both U+0666 (Arabic-Indic) and U+06F6 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    six_pattern = re.compile(r'([\u0666]|[\u06F6])')
    return six_pattern.sub("6", txt)
def convert_seven(txt):
    """Return *txt* with Arabic-Indic seven digits replaced by ASCII "7".

    Both U+0667 (Arabic-Indic) and U+06F7 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    seven_pattern = re.compile(r'([\u0667]|[\u06F7])')
    return seven_pattern.sub("7", txt)
def convert_eight(txt):
    """Return *txt* with Arabic-Indic eight digits replaced by ASCII "8".

    Both U+0668 (Arabic-Indic) and U+06F8 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    eight_pattern = re.compile(r'([\u0668]|[\u06F8])')
    return eight_pattern.sub("8", txt)
def convert_nine(txt):
    """Return *txt* with Arabic-Indic nine digits replaced by ASCII "9".

    Both U+0669 (Arabic-Indic) and U+06F9 (Extended Arabic-Indic) are
    replaced; every other character is left untouched.
    """
    nine_pattern = re.compile(r'([\u0669]|[\u06F9])')
    return nine_pattern.sub("9", txt)
# Translation table mapping the Arabic-Indic digits (U+0660..U+0669) and the
# Extended Arabic-Indic digits (U+06F0..U+06F9) to ASCII "0".."9".
# Built once at import time so each call is a single pass over the text.
_ASCII_DIGIT_TABLE = {
    block_start + digit: str(digit)
    for block_start in (0x0660, 0x06F0)
    for digit in range(10)
}


def convert_numbers(txt):
    """Return *txt* with every Arabic-Indic digit replaced by its ASCII digit.

    Handles both the Arabic-Indic block (U+0660..U+0669) and the Extended
    Arabic-Indic block (U+06F0..U+06F9). All other characters, including
    digits that are already ASCII, are returned unchanged, so the result
    always has the same length as the input.

    Args:
        txt: The string to normalise.

    Returns:
        A new string with the digits converted.
    """
    # One str.translate pass replaces ten successive regex substitutions.
    return txt.translate(_ASCII_DIGIT_TABLE)
# Matches each run of ASCII digits. Compiled once here instead of once per
# input line inside the loop below.
_NUMBER_RE = re.compile(r'([0-9]+)')

# Fixed USFM header written verbatim at the top of every output file.
_USFM_HEADER = (
    '\\id BOOK_CODE Unlocked Literal Bible \n'
    '\\ide UTF-8 \n'
    '\\h Book_Name \n'
    '\\toc1 Book_Name_Extended \n'
    '\\toc2 Book_Name \n'
    '\\toc3 Book_Code \n'
    '\\mt Book_Name \n\n'
)

# Convert every .docx / .txt file in the current directory into a .usfm file
# with the same base name.
for f in os.listdir("."):
    if not f.endswith((".docx", ".txt")):
        continue

    is_docx = f.endswith(".docx")

    # Read all the text up front so the input file is closed before the
    # output file is written.
    if is_docx:
        with open(f, 'rb') as doc:
            lines = [paragraph.text for paragraph in Document(doc).paragraphs]
    else:
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM, so the
        # result no longer depends on the platform's default encoding.
        with open(f, 'r', encoding='utf-8-sig') as doc:
            lines = doc.readlines()

    # The header declares "\ide UTF-8", so the file must really be written
    # as UTF-8 rather than in the locale encoding.
    with open(os.path.splitext(f)[0] + ".usfm", 'w', encoding='utf-8') as usfm:
        usfm.write(_USFM_HEADER)

        # True only for the first paragraph after a chapter marker; that
        # paragraph must not get its own \s5 because the chapter marker
        # already wrote one.
        is_first_paragraph = False

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Convert Arabic-Indic digits to ASCII, if there are any.
            p_line = convert_numbers(line)

            m = _NUMBER_RE.search(p_line)
            if m is None:
                # Lines that contain no number are not written to the output.
                continue

            if len(line) <= 3:
                # A short line containing a number is a chapter marker.
                usfm.write('\\s5\n')
                usfm.write('\\c ' + m.group() + '\n\n')
                is_first_paragraph = True
            else:
                # Every number in a longer line becomes a verse marker on
                # its own new line.
                v_line = _NUMBER_RE.sub(r'\n\\v \1 ', p_line)

                if not is_first_paragraph:
                    usfm.write('\\s5\n')

                usfm.write('\\p')
                usfm.write(v_line + '\n\n')
                is_first_paragraph = False