ScraperFromYandexText.py
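
"""Download imx.to galleries found in saved Yandex search-result text.

Pipeline: extract_and_save_links() pulls gallery URLs out of a text corpus
and writes them to links.txt; main() walks that list, skipping anything
already recorded in completedDownloads.txt; process_link() scrapes each
gallery page and fans the image downloads out over a shared thread pool.
"""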
import os
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import concurrent.futures

# Root folder where downloaded galleries are stored.
downloaded_images_folder = 'F:\\roberta'
if not os.path.exists(downloaded_images_folder):
    os.makedirs(downloaded_images_folder)

# Shared thread pool for image downloads. max_workers=None lets Python pick
# a default (min(32, os.cpu_count() + 4)) rather than an unlimited thread count.
EXECUTOR = concurrent.futures.ThreadPoolExecutor(max_workers=None)


def download_image(link, folder, idx):
    """Download one image into `folder`, showing a byte-level progress bar."""
    try:
        response = requests.get(link, timeout=10, stream=True)
        if response.status_code == 200:
            total_size = int(response.headers.get('content-length', 0))
            image_extension = link.split('.')[-1]
            image_filename = f'image_{idx + 1}.{image_extension}'
            image_path = os.path.join(folder, image_filename)
            # Stream the body in 1 KiB chunks so large files never sit in memory.
            with open(image_path, 'wb') as f:
                with tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
                    for chunk in response.iter_content(1024):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
            return image_filename
        else:
            return None
    except requests.exceptions.Timeout:
        print(f"Connection timed out for link: {link}")
        return None


def update_completed_downloads(link):
    """Append `link` to the completed-downloads log if it is not already there."""
    with open('D:\\cool python projects\\imx_scraper\\completedDownloads.txt', 'a+') as f:
        # 'a+' opens positioned at the end; rewind to read the existing entries.
        f.seek(0)
        links_in_file = f.read().splitlines()
        if link not in links_in_file:
            f.write(link + '\n')


def process_link(link):
    """Fetch a gallery page and download every image on it via the thread pool."""
    try:
        response = requests.get(link, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        title_element = soup.find('div', class_='title')
        if title_element:
            # Sanitize the title so it is usable as a folder name.
            title = title_element.get_text(strip=True).replace('/', '_')
            title_folder = os.path.join(downloaded_images_folder, title)
            if not os.path.exists(title_folder):
                os.makedirs(title_folder)
            image_containers = soup.find_all('div', class_='tooltip')
            futures = []
            for idx, container in enumerate(image_containers):
                img_tag = container.find('img', class_='imgtooltip')
                if img_tag:
                    # Thumbnails are served from /t/; the full-size image lives at /i/.
                    # The '>' cleanup strips separators left over from scraped text.
                    img_src = img_tag['src'].replace('/t/', '/i/').replace('>', '/')
                    futures.append(EXECUTOR.submit(download_image, img_src, title_folder, idx))
            # Mark the gallery as completed once at least one image has arrived.
            for future in concurrent.futures.as_completed(futures):
                if future.result():
                    update_completed_downloads(link)
            print(f"Downloaded {len(image_containers)} images in '{title}'")
            # Keep a manifest of the original thumbnail URLs alongside the images.
            with open(os.path.join(title_folder, 'details.txt'), 'w') as f:
                for container in image_containers:
                    img_tag = container.find('img', class_='imgtooltip')
                    if img_tag:
                        f.write(img_tag['src'] + '\n')
        else:
            print("Title element not found on the page.")
    except requests.exceptions.Timeout:
        print(f"Connection timed out for link: {link}")


def extract_and_save_links(text_file_path):
    """Pull imx.to gallery links out of saved Yandex search-result text."""
    with open(text_file_path, 'r', encoding='utf-8') as file:
        page_content = file.read()
    # Yandex renders URL paths with '›' separators, e.g. imx.to›g/abc123.
    links = re.findall(r'imx\.to›g/[\w-]+', page_content)
    # Rewrite them as real URLs and save to links.txt.
    with open('D:\\cool python projects\\imx_scraper\\links.txt', 'w') as links_file:
        for link in links:
            modified_link = 'https://' + link.replace('›', '/').replace('>', '/')
            links_file.write(modified_link + '\n')


def main():
    text_file_path = 'D:\\cool python projects\\imx_scraper\\text_corpus.txt'  # Replace with your text file path
    extract_and_save_links(text_file_path)
    completed_path = 'D:\\cool python projects\\imx_scraper\\completedDownloads.txt'
    # Create the completed-downloads log on first run so the read below cannot fail.
    if not os.path.exists(completed_path):
        open(completed_path, 'w').close()
    with open('D:\\cool python projects\\imx_scraper\\links.txt', 'r') as file:
        links = file.read().splitlines()
    for link in links:
        # Re-read the log each pass so galleries finished earlier in this run are skipped.
        with open(completed_path, 'r') as file:
            completed_links = file.read().splitlines()
        if link not in completed_links:
            process_link(link)
    print("All images downloaded and saved.")


if __name__ == "__main__":
    main()
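
Dependencies (inferred from the imports): requests, beautifulsoup4, and tqdm, installable with pip install requests beautifulsoup4 tqdm. The hard-coded D:\ and F:\ paths assume the original author's Windows layout and should be adjusted before running.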