-
Notifications
You must be signed in to change notification settings - Fork 0
/
collector_chapter_3.py
76 lines (63 loc) · 2.94 KB
/
collector_chapter_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import asyncio
import random
import aiohttp
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
async def fetch_chapter(session, url, headers, count, chapter_name):
    """Download one chapter page, cache its HTML and save its text as JSON.

    The raw page is written to ``data/html/{chapter_name}.html``. The extracted
    record is written to ``data/json/{count}.json`` as a one-element list whose
    item has the keys ``novel_name``, ``chapter_name`` and ``content``.

    Args:
        session: Client whose ``get(url, headers=...)`` works as an async
            context manager and yields a response with an awaitable
            ``text()`` (``main`` passes an ``aiohttp.ClientSession``).
        url: Address of the chapter page.
        headers: HTTP headers sent with the request.
        count: Value used as the JSON output file name.
        chapter_name: Value used as the HTML file name and stored in the
            JSON record.

    Returns:
        None. When the page has no element with class ``chapter-content`` an
        error message is printed and no JSON file is written.
    """
    async with session.get(url, headers=headers) as resp:
        src = await resp.text()

    html_path = f"data/html/{chapter_name}.html"
    with open(html_path, "w", encoding='utf-8') as file:
        file.write(src)
    # Parse the copy that was just saved, so the extracted text always
    # corresponds to the HTML file kept on disk.
    with open(html_path, encoding='utf-8') as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")
    chapter_content = soup.find(class_="chapter-content")
    # Guard on the element itself: find() returns None when nothing matches,
    # and reading ``.text`` from None would raise AttributeError before any
    # later check could run.
    if chapter_content is None:
        print("Ошибка: не удалось найти элемент с классом 'chapter-content'.")
        return
    chapter_text = chapter_content.text

    # The novel title is the ``title`` attribute of the last link inside the
    # ``titles`` block; it stays None when the block or its links are absent,
    # so a page without a title still has its chapter text saved.
    novel_name = None
    titles_block = soup.find('div', {'class': 'titles'})
    if titles_block is not None:
        for link in titles_block.find_all('a'):
            novel_name = link.get('title')

    json_data = [{
        "novel_name": novel_name,
        "chapter_name": chapter_name,
        "content": chapter_text
    }]
    with open(f"data/json/{count}.json", "w", encoding='utf-8') as file:
        json.dump(json_data, file, indent=10, ensure_ascii=False)
async def main():
    """Schedule a download task for every chapter in the saved link list.

    Reads ``data/temp/all_chapter_links.json`` and iterates it as a mapping of
    chapter URL to chapter title. Each title is reduced to its numeric
    characters and handed to ``fetch_chapter`` together with a 1-based
    counter. Tasks are started one at a time with a random pause of one or two
    seconds between them, then awaited together before the HTTP session is
    closed.
    """
    headers = {
        "Accept":
            "*/*",
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0"
    }

    # Load the chapter links from the JSON file.
    with open('data/temp/all_chapter_links.json', encoding='utf-8') as file:
        all_chapter = json.load(file)

    iteration_count = len(all_chapter)
    print(f" Количесво глав для чтения: {iteration_count}")

    with tqdm(total=iteration_count, leave=True) as pbar:
        async with aiohttp.ClientSession() as session:
            tasks = []
            for count, (chapter_href, chapter_name) in enumerate(
                    all_chapter.items(), start=1):
                # Keep only the numeric characters of the chapter title.
                chapter_name = "".join(
                    c for c in chapter_name if c.isnumeric())

                # Start the download now; the list keeps a reference to each
                # task until they are all awaited below.
                tasks.append(asyncio.create_task(fetch_chapter(
                    session, chapter_href, headers, count, chapter_name)))

                # Pause 1-2 seconds before starting the next request.
                await asyncio.sleep(random.randrange(1, 3))
                # The bar advances when a chapter is scheduled, not when its
                # download finishes.
                pbar.update(1)

            # Wait for every download before the session is closed.
            await asyncio.gather(*tasks)
# Script entry point: run the collector only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())