-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_gesetz.py
62 lines (52 loc) · 2.24 KB
/
parse_gesetz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from bs4 import BeautifulSoup
from copy import deepcopy
from elasticsearch import Elasticsearch, helpers
import os
# Module-level Elasticsearch client, created at import time and pointed at
# http://localhost:9202; create_index() and index_documents() both use it.
es = Elasticsearch("http://localhost:9202")
def parse_html(html_content):
    """Turn one law document into a list of per-paragraph records.

    The markup is parsed with BeautifulSoup's ``lxml-xml`` parser and every
    ``<div>`` returned by ``find_all`` is inspected:

    * a div whose class is ``jnnorm`` updates the running heading fields:
      ``buch`` (text of its ``<h1>``), ``kapitel`` (text of its ``<h2>``),
      ``abschnitt_n`` / ``abschnitt_t`` (the ``jnenbez`` / ``jnentitel``
      spans inside its ``<h3>``);
    * a div whose class is ``jnhtml`` emits one record per nested
      ``<div class="jurAbsatz">``, consisting of the running heading fields
      plus the paragraph text under ``absatz``.

    Args:
        html_content: The document markup to parse.

    Returns:
        A list of dicts, one per ``jurAbsatz`` paragraph, each an independent
        copy of the heading fields known at that point plus ``absatz``.
    """
    document = BeautifulSoup(html_content, "lxml-xml")
    jurs: list[dict[str, str]] = []

    # One dict is reused for the whole walk, so a heading field keeps its
    # value until a later "jnnorm" div overwrites it.
    # NOTE(review): nothing ever clears a field, so a norm without e.g. a
    # "jnentitel" span inherits the previous norm's "abschnitt_t" - confirm
    # that this carry-over is intended for the section-level fields.
    current = {}

    # (tag name, record key, separator used when joining the tag's text)
    heading_fields = (
        ("h1", "buch", " "),
        ("h2", "kapitel", " - "),
    )

    for div in document.find_all("div"):
        # Compared as a plain string: presumably relies on the XML parser
        # leaving "class" unsplit - re-check if the parser is ever changed.
        css_class = div.attrs.get("class")

        if css_class == "jnnorm":
            for tag_name, key, joiner in heading_fields:
                heading = div.find(tag_name)
                if heading is not None:
                    current[key] = heading.get_text(separator=joiner).strip()

            section = div.find("h3")
            if section is not None:
                number = section.find("span", class_="jnenbez")
                if number is not None:
                    current["abschnitt_n"] = number.text.strip()
                title = section.find("span", class_="jnentitel")
                if title is not None:
                    current["abschnitt_t"] = title.get_text(separator=" - ").strip()

        elif css_class == "jnhtml":
            for paragraph in div.find_all("div", class_="jurAbsatz"):
                text = paragraph.get_text(separator=" ").strip()
                text = text.replace("\n", " ")
                text = text.replace("§", " §")
                current["absatz"] = text
                # Copy, because `current` keeps being mutated afterwards.
                jurs.append(deepcopy(current))

    return jurs
def process_html_files(file_path: str, parser=parse_html):
    """Read a single HTML file and hand its text to *parser*.

    Args:
        file_path: Path of the file to read; it is decoded as ISO-8859-1.
        parser: Callable that receives the file's text and returns the parsed
            result. Defaults to ``parse_html``.

    Returns:
        Whatever *parser* returns for the file's content.
    """
    with open(file_path, mode="r", encoding="iso-8859-1") as handle:
        raw_html = handle.read()
    return parser(raw_html)
def create_index(index_name):
    """Create the Elasticsearch index *index_name* if it is not there yet.

    No settings or mappings are passed; the index is created with whatever
    the server applies by default.

    Args:
        index_name: Name of the index to check for and, if missing, create.
    """
    already_present = es.indices.exists(index=index_name)
    if already_present:
        return
    # ignore=400 tells the client not to raise on an HTTP 400 response, which
    # covers an "index already exists" error despite the check above.
    es.indices.create(index=index_name, ignore=400)
def index_documents(docs, index_name):
    """Send *docs* to Elasticsearch in one bulk helper call.

    Each document becomes an action targeting *index_name* with the document
    itself as ``_source``; no ``_id`` is supplied. The result of
    ``helpers.bulk`` is discarded.

    Args:
        docs: Iterable of documents to index.
        index_name: Name of the index every document is written to.
    """
    helpers.bulk(
        es,
        [{"_index": index_name, "_source": document} for document in docs],
    )
if __name__ == "__main__":
    # Script entry point: make sure the target index exists, then parse and
    # index every *.html file found directly in the downloads directory.
    es_index = "gesetz"
    create_index(es_index)
    for entry in os.listdir("/home/nathan/Downloads"):
        if not entry.endswith(".html"):
            continue
        parsed = process_html_files(f"/home/nathan/Downloads/{entry}")
        index_documents(parsed, es_index)