-
Notifications
You must be signed in to change notification settings - Fork 2
/
docxtools.py
38 lines (32 loc) · 1.24 KB
/
docxtools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
import re
import statistics
import subprocess
import tempfile

import docx
class Document:
    """Wrapper around a python-docx document loaded from a Word file.

    Instance attributes set by ``__init__``:

    * ``doc`` -- the object returned by ``docx.Document``.
    * ``body_hash`` -- the most frequent run-formatting hash in the
      document (see ``_get_hash``), or ``None`` when there are no runs.
      Presumably this identifies the body-text style -- TODO confirm.
    * ``header_hash`` -- always initialised to ``None`` here.
    """

    def __init__(self, file):
        """Load *file*, converting a legacy ``.doc`` through LibreOffice first.

        Args:
            file: Path to a ``.docx`` or ``.doc`` file. The ``.doc``
                extension is matched case-insensitively.

        Raises:
            Whatever ``docx.Document`` raises when the (possibly converted)
            file cannot be opened. The exit status of ``soffice`` is not
            inspected; a failed conversion surfaces as a missing file.
        """
        if file.lower().endswith('.doc'):
            # Convert into a private temporary directory so that an existing
            # "<name>.docx" in the working directory is never overwritten or
            # deleted. The directory, and the converted copy in it, is removed
            # once the document has been loaded.
            with tempfile.TemporaryDirectory() as out_dir:
                subprocess.call([
                    'soffice', '--headless', '--convert-to', 'docx',
                    '--outdir', out_dir, file,
                ])
                stem = os.path.splitext(os.path.basename(file))[0]
                self.doc = docx.Document(os.path.join(out_dir, stem + '.docx'))
        else:
            self.doc = docx.Document(file)
        self.body_hash = self._compute_body_hash()
        self.header_hash = None

    def _compute_body_hash(self):
        """Return the most common formatting hash over all runs, or ``None``.

        ``None`` is returned for a document without any runs, where
        ``statistics.mode`` would otherwise raise ``StatisticsError``.
        """
        hashes = [
            self._get_hash(run)
            for para in self.doc.paragraphs
            for run in para.runs
        ]
        if not hashes:
            return None
        return statistics.mode(hashes)

    def _get_hash(self, run):
        """Return a hash of *run*'s size, font name, bold, italic and colour."""
        return hash((
            run.font.size,
            run.font.name,
            run.bold,
            run.italic,
            run.font.color.rgb,
        ))

    def get_text(self, section):
        """Return the text of every run, with newlines replaced by spaces.

        Run texts are concatenated in document order with no separator
        between runs or between paragraphs.

        Args:
            section: Accepted for interface compatibility; currently unused,
                the full document text is always returned.
        """
        text = ''.join(
            run.text
            for para in self.doc.paragraphs
            for run in para.runs
        )
        return text.replace('\n', ' ')
# Batch driver: for every Word file in 'Umairs sample files', write one line
# to full_texts.txt made of the file name followed by the comma-joined regex
# matches found in the document text.
#
# The pattern has a single capturing group, so re.findall yields only the
# captured keyword ('METHOD' or 'Method') for each match, not the whole match.
# NOTE(review): the file name and the first match are written with no
# separator between them; kept as-is to preserve the output format -- confirm
# whether a delimiter was intended.
_method_heading = re.compile(r'(METHOD|Method)s?\W+[A-Z]')

# The context manager guarantees the output is flushed and closed, even if a
# document fails to load part-way through the directory.
with open('full_texts.txt', 'w', encoding='utf-8') as f:
    for f_name in os.listdir('Umairs sample files'):
        # Match on the real extension, not any name that merely contains
        # '.doc' (e.g. 'x.doc.bak'), which Document cannot open.
        if f_name.endswith(('.doc', '.docx')):
            text = Document(f'Umairs sample files/{f_name}').get_text('methods')
            f.write(f_name + ','.join(_method_heading.findall(text)) + '\n')