-
Notifications
You must be signed in to change notification settings - Fork 0
/
pi_1_build_simple_corpus.py
36 lines (26 loc) · 1.3 KB
/
pi_1_build_simple_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from tools.tokenize_docs import Tokenizer
from tools import coll
from tools.simple_corp import SimpleCorp
import os
from tqdm import tqdm
from tools.relative_paths_to_directories import path_to_directories
# Resolve the project directory layout from the current working directory.
# NOTE(review): the result depends on where the script is launched from —
# confirm it is always run from the expected location.
(
    PATH_TO_ROOT,
    PATH_TO_TOOLS,
    PATH_TO_FILES,
    PATH_TO_TF_IDF,
    PATH_TO_INV_IND,
    PATH_TO_BM_25,
    PATH_TO_LEARNING_TO_RANK,
) = path_to_directories(os.getcwd())

# Directory containing the codex files.
codexes_dir = os.path.join(PATH_TO_ROOT, "codexes")
# Output directory shared by every corpus saved at the end of the script.
corp_dir = os.path.join(PATH_TO_FILES, "corp")

tokenizer = Tokenizer()
simple_corp = SimpleCorp()
simple_corp_art_names = SimpleCorp()

# Pass 1: add every document produced by the 'article' split to simple_corp.
# NOTE(review): here the result of iter_by_docs is unpacked as a 2-tuple,
# while the 'art_name' call below uses the return value directly as a
# mapping — confirm the return shape really differs between the two calls.
for codex_file in tqdm(os.listdir(codexes_dir)):
    articles, _ = coll.iter_by_docs(codex_file, codexes_dir, 'article', 0)
    for article_id, article_text in articles.items():
        simple_corp.add_doc(article_id, article_text)

# Pass 2: add every document produced by the 'art_name' split to
# simple_corp_art_names.
for codex_file in tqdm(os.listdir(codexes_dir)):
    art_names = coll.iter_by_docs(codex_file, codexes_dir, 'art_name', 1)
    for name_id, name_text in art_names.items():
        simple_corp_art_names.add_doc(name_id, name_text)

# Build a third corpus from simple_corp using the tokenizer.
tokenized_corp = SimpleCorp()
tokenized_corp.make_from(simple_corp, tokenizer)

# Save all three corpora into the shared output directory.
simple_corp.save('codexes_corp_articles', corp_dir)
tokenized_corp.save('codexes_tokenized_corp_articles', corp_dir)
simple_corp_art_names.save('codexes_corp_art_names', corp_dir)