-
Notifications
You must be signed in to change notification settings - Fork 0
/
hardcoded_version.py
128 lines (99 loc) · 3.67 KB
/
hardcoded_version.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import re
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
def clean_text(text):
    """Normalize whitespace and ATX header spacing in markdown text.

    The following clean-ups are applied, in order:

    - trailing whitespace is removed from every line;
    - outside fenced code blocks, a line-leading run of 1-6 ``#``
      characters directly followed by header text gets exactly one
      space between the hashes and the text;
    - runs of two or more blank lines collapse to a single blank line;
    - trailing whitespace and newlines at the very end are removed.

    Args:
        text: Markdown content as a single string.

    Returns:
        The cleaned markdown string.
    """
    cleaned_lines = []
    in_code_block = False
    for line in text.split("\n"):
        # Strip per line: a multiline "\s+$" pattern also consumes the
        # newlines themselves and would silently delete blank lines.
        line = line.rstrip()
        if line.lstrip().startswith("```"):
            in_code_block = not in_code_block
        elif not in_code_block:
            # Anchored at line start so "#" inside URLs or prose is left
            # alone; fenced code (comments, #include, shebangs) is skipped.
            line = re.sub(r"^( {0,3}#{1,6})[ \t]*(?=[^#\s])", r"\1 ", line)
        cleaned_lines.append(line)

    text = "\n".join(cleaned_lines)
    # Lines are already right-stripped, so blank lines are bare newlines.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.rstrip()
def format_markdown(content):
    """Apply consistent formatting to markdown content.

    The content is processed line by line:

    - blank lines before the first kept line are dropped;
    - lines starting with a code fence toggle code-block state; fence
      lines and lines inside a code block only lose trailing whitespace;
    - lines whose stripped text starts with ``#`` are treated as headers:
      a single space is placed after the hashes and a blank line is
      ensured before and after;
    - bullet (``-``, ``*``, ``+``) and numbered list lines keep their
      leading indentation and only lose trailing whitespace;
    - other non-empty lines are stripped on both sides;
    - consecutive blank lines are collapsed to one.

    The joined result is finally passed through ``clean_text``.

    Args:
        content: Raw markdown text.

    Returns:
        The formatted markdown string.
    """
    formatted_lines = []
    in_code_block = False

    for line in content.split("\n"):
        stripped = line.strip()

        # skip empty lines at start of file
        if not formatted_lines and not stripped:
            continue

        # handle code blocks: fences and their contents are kept verbatim
        # apart from trailing whitespace
        if stripped.startswith("```"):
            in_code_block = not in_code_block
            formatted_lines.append(line.rstrip())
            continue
        if in_code_block:
            formatted_lines.append(line.rstrip())
            continue

        # format headers
        if stripped.startswith("#"):
            # Normalize the stripped text: the pattern is anchored at the
            # start, so running it on the raw line would skip any header
            # that carries leading whitespace.
            header = re.sub(r"^(#+)\s*", r"\1 ", stripped).strip()
            # add blank line before headers (except at start of file)
            if formatted_lines and formatted_lines[-1].strip():
                formatted_lines.append("")
            formatted_lines.append(header)
            formatted_lines.append("")
            continue

        # bullet and numbered list items keep their indentation
        if re.match(r"^\s*(?:[-*+]|\d+\.)\s", line):
            formatted_lines.append(line.rstrip())
            continue

        # regular lines
        if stripped:
            formatted_lines.append(stripped)
        elif formatted_lines and formatted_lines[-1].strip():
            # only add blank line if previous line wasn't blank
            formatted_lines.append("")

    # clean up the final content
    return clean_text("\n".join(formatted_lines))
def scrape_to_markdown(url, output_dir="scraped_mds", timeout=30):
    """Scrape a webpage and save it as formatted markdown.

    The page is fetched with ``requests``, ``script``/``style``/``nav``/
    ``footer`` elements are removed, the remaining HTML is converted with
    ``markdownify`` and passed through ``format_markdown``. The result is
    written as UTF-8 to ``<output_dir>/<name>.md``, where the name is the
    URL without its scheme and with path separators and other characters
    that are unsafe in file names replaced by ``_``.

    Failures are reported with ``print`` and are not re-raised, so one
    failing URL does not stop a batch run.

    Args:
        url: Address of the page to scrape.
        output_dir: Directory for the markdown file; created if missing.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the call forever.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # remove unwanted elements
        for element in soup.find_all(["script", "style", "nav", "footer"]):
            element.decompose()

        # convert to markdown
        markdown_content = md(
            str(soup), heading_style="ATX", bullets="-", code_language="python"
        )

        # format the content
        formatted_content = format_markdown(markdown_content)

        # create output directory
        os.makedirs(output_dir, exist_ok=True)

        # create filename from URL; besides "/", also replace characters
        # (query "?", port ":", etc.) that are invalid in file names on
        # some platforms
        filename = re.sub(r'[\\/:*?"<>|]', "_", url.split("://")[-1])
        if not filename.endswith(".md"):
            filename += ".md"
        filepath = os.path.join(output_dir, filename)

        # write to file
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(formatted_content)

        print(f"Successfully scraped: {url} -> {filepath}")
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e!s}")
    except Exception as e:
        # Deliberate best-effort boundary: report and move on.
        print(f"Unexpected error processing {url}: {e!s}")
def main(urls=None):
    """Scrape every URL in ``urls`` to a markdown file.

    Args:
        urls: Optional iterable of page URLs. When ``None``, the built-in
            default list below is used.
    """
    if urls is None:
        urls = [
            "https://docs.jito.wtf/",
            "https://docs.triton.one/project-yellowstone/whirligig-websockets",
        ]

    for url in urls:
        scrape_to_markdown(url)


if __name__ == "__main__":
    main()