diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fb7c5e5..cdffdf3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,7 +28,6 @@ concurrency: jobs: test: name: 🐍 Python ${{ matrix.python-version }} on ${{ matrix.os }} - if: startsWith(github.event.head_commit.message, 'Bump version to') || startsWith(github.event.head_commit.message, '[CI]') strategy: fail-fast: false matrix: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 11e74b6..53daa1c 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -51,7 +51,6 @@ jobs: ### Available Extras: - 🎬 YouTube Tools: `streamsnapper[youtube]` - - 📥 Downloader: `streamsnapper[downloader]` - 🔄 Merger: `streamsnapper[merger]` - ✨ All Features: `streamsnapper[all]` @@ -63,7 +62,7 @@ jobs: ### 📊 Compare Changes - [View all changes](https://github.com/Henrique-Coder/streamsnapper/compare/${{ steps.previoustag.outputs.tag }}...v${{ steps.version.outputs.version }}) + [View all changes](https://github.com/henrique-coder/streamsnapper/compare/${{ steps.previoustag.outputs.tag }}...v${{ steps.version.outputs.version }}) draft: false prerelease: false env: diff --git a/README.md b/README.md index 6d890b1..71fd0a1 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ StreamSnapper is an intuitive library designed to simplify, enhance, and organiz ```bash pip install -U streamsnapper # It does not have any features by default, but it can be extended with optional features -pip install -U streamsnapper[downloader] # It has the feature of downloading online content with support for multiple simultaneous connections pip install -U streamsnapper[merger] # It has the feature of merging video files with audio files using FFmpeg (currently it does not need any dependencies) pip install -U streamsnapper[youtube] # It has advanced features to extract data from YouTube, with support for several other features pip install -U streamsnapper[all] # It has all features available at once @@ -20,37 +19,6 @@ pip install -U streamsnapper[all] # It has all features available at once ### Example Usage -#### `streamsnapper[downloader]` - -```python -from streamsnapper import Downloader -from pathlib import Path # Optional - - -# A class for downloading direct download URLs. -downloader = Downloader( - # Initialize the Downloader class with the required settings for downloading a file. - max_connections='auto', # The maximum number of connections to use for downloading the file. (default: 'auto') - connection_speed=80, # The connection speed in Mbps. (default: 80) - overwrite=True, # Overwrite the file if it already exists. Otherwise, a "_1", "_2", etc. suffix will be added. (default: True) - show_progress_bar=True, # Show or hide the download progress bar. (default: True) - custom_headers=None, # Custom headers to include in the request. If None, default headers will be used. Imutable headers are 'Accept-Encoding' and 'Range'. (default: None) - timeout=None # Timeout in seconds for the download process. Or None for no timeout. (default: None) -) - -# Downloads a file from the provided URL to the output file path. -# - If the output_path is a directory, the file name will be generated from the server response. -# - If the output_path is a file, the file will be saved with the provided name. -# - If not provided, the file will be saved to the current working directory. -downloader.download( - url='https://example.com/file', # The download URL to download the file from. 
(required) - output_path=Path.cwd() # The path to save the downloaded file to. If the path is a directory, the file name will be generated from the server response. If the path is a file, the file will be saved with the provided name. If not provided, the file will be saved to the current working directory. (default: Path.cwd()) -) - -# All functions are documented and have detailed typings, use your development IDE to learn more. - -``` - #### `streamsnapper[merger]` ```python diff --git a/poetry.lock b/poetry.lock index baaab19..7bfdebf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -555,6 +555,22 @@ files = [ {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] +[[package]] +name = "turbodl" +version = "0.0.1" +description = "An extremely smart and efficient download manager for various cases." +optional = true +python-versions = "<4.0,>=3.9" +files = [ + {file = "turbodl-0.0.1-py3-none-any.whl", hash = "sha256:b26a84f5f91de669ce96ab083b1fa0d6d0c976114e38d38b9f43c65fa7ae0d31"}, + {file = "turbodl-0.0.1.tar.gz", hash = "sha256:d358adf9d224b57d44c73f48cbc5251cec6c2c4904aa41e43e68b62203910e8f"}, +] + +[package.dependencies] +httpx = "0.28.1" +rich = "13.9.4" +tenacity = "9.0.0" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -605,12 +621,11 @@ static-analysis = ["autopep8 (>=2.0,<3.0)", "ruff (>=0.8.0,<0.9.0)"] test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [extras] -all = ["httpx", "rich", "scrapetube", "tenacity", "yt-dlp"] -downloader = ["httpx", "rich", "tenacity"] +all = ["httpx", "scrapetube", "turbodl", "yt-dlp"] merger = [] -youtube = ["httpx", "rich", "scrapetube", "tenacity", "yt-dlp"] +youtube = ["httpx", "scrapetube", "turbodl", "yt-dlp"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "e7e80e78a46727a95fef9703e71b5a2bb1b058cfc2cbbd3070e0b70098eca86e" +content-hash = "f3a3563ebb5bdcbe0ade9c87c5dc68581d918e8ec79b2d9584926c62d85ac5b5" diff --git a/pyproject.toml b/pyproject.toml index c1244c7..dc1ae96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,9 +31,8 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.9" httpx = { version = "0.28.1", optional = true } -rich = { version = "13.9.4", optional = true } scrapetube = { version = "2.5.1", optional = true } -tenacity = { version = "9.0.0", optional = true } +turbodl = { version = "0.0.1", optional = true } yt-dlp = { version = "2024.12.13", optional = true } [tool.poetry.dev-dependencies] @@ -41,7 +40,6 @@ orjson = "*" pytest = "*" [tool.poetry.extras] -downloader = ["httpx", "rich", "tenacity"] merger = [] -youtube = ["httpx", "rich", "scrapetube", "tenacity", "yt-dlp"] -all = ["httpx", "rich", "scrapetube", "tenacity", "yt-dlp"] +youtube = ["httpx", "scrapetube", "turbodl", "yt-dlp"] +all = ["httpx", "scrapetube", "turbodl", "yt-dlp"] diff --git a/requirements.txt b/requirements.txt index ba69814..ae62be6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ httpx == 0.28.1 -rich == 13.9.4 scrapetube == 2.5.1 -tenacity == 9.0.0 +turbodl == 0.0.1 yt-dlp == 2024.12.13 \ No newline at end of file diff --git a/streamsnapper/__init__.py b/streamsnapper/__init__.py index e385a66..abf25a6 100644 --- a/streamsnapper/__init__.py +++ b/streamsnapper/__init__.py @@ -2,31 +2,25 @@ from typing import List # Local imports -from .downloader import Downloader from .exceptions import ( - DownloadError, EmptyDataError, FFmpegNotFoundError, InvalidDataError, MergeError, - RequestError, 
ScrapingError, - StreamBaseError, + StreamSnapperError, ) from .merger import Merger from .platforms.youtube import YouTube, YouTubeExtractor __all__: List[str] = [ - 'Downloader', - 'DownloadError', 'EmptyDataError', 'FFmpegNotFoundError', 'InvalidDataError', 'MergeError', - 'RequestError', 'ScrapingError', - 'StreamBaseError', + 'StreamSnapperError', 'Merger', 'YouTube', 'YouTubeExtractor', diff --git a/streamsnapper/downloader.py b/streamsnapper/downloader.py deleted file mode 100644 index 0496570..0000000 --- a/streamsnapper/downloader.py +++ /dev/null @@ -1,321 +0,0 @@ -# Built-in imports -from concurrent.futures import ThreadPoolExecutor -from functools import lru_cache -from math import ceil -from mimetypes import guess_extension as guess_mimetype_extension -from os import PathLike -from pathlib import Path -from typing import Dict, List, Literal, Optional, Tuple, Union -from urllib.parse import unquote, urlparse - -# Third-party imports -from httpx import Client, HTTPStatusError -from rich.progress import BarColumn, DownloadColumn, Progress, TextColumn, TimeRemainingColumn, TransferSpeedColumn -from tenacity import retry, stop_after_attempt, wait_exponential - -# Local imports -from .exceptions import DownloadError, RequestError - - -class Downloader: - """A class for downloading direct download URLs.""" - - def __init__( - self, - max_connections: Union[int, Literal['auto']] = 'auto', - connection_speed: float = 80, - overwrite: bool = True, - show_progress_bar: bool = True, - custom_headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, - ) -> None: - """ - Initialize the Downloader class with the required settings for downloading a file. - - Args: - max_connections: The maximum number of connections to use for downloading the file. (default: 'auto') - connection_speed: The connection speed in Mbps. (default: 80) - overwrite: Overwrite the file if it already exists. Otherwise, a "_1", "_2", etc. suffix will be added. (default: True) - show_progress_bar: Show or hide the download progress bar. (default: True) - custom_headers: Custom headers to include in the request. If None, default headers will be used. Imutable headers are 'Accept-Encoding' and 'Range'. (default: None) - timeout: Timeout in seconds for the download process. Or None for no timeout. (default: None) - """ - - self._max_connections: Union[int, Literal['auto']] = max_connections - self._connection_speed: int = connection_speed - self._overwrite: bool = overwrite - self._show_progress_bar: bool = show_progress_bar - self._timeout: Optional[int] = timeout - - imutable_headers = ['Accept-Encoding', 'Range'] - - self._custom_headers: Dict[str, str] = { - 'Accept': '*/*', - 'Accept-Encoding': 'identity', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', - } - - if custom_headers: - for key, value in custom_headers.items(): - if key.title() not in imutable_headers: - self._custom_headers[key.title()] = value - - self._client: Client = Client(headers=self._custom_headers, follow_redirects=True, timeout=self._timeout) - - self.output_path: str = None - - @lru_cache() - def _calculate_connections(self, file_size: int, connection_speed: Union[float, Literal['auto']]) -> int: - """ - Calculates optimal number of connections based on file size and connection speed. 
- - - The connection speed ranges and recommended connections: - - | Connection Speed | Base Multiplier | - | ---------------- | --------------- | - | < 10 Mbps | 0.2x | - | 10-50 Mbps | 0.4x | - | 50-100 Mbps | 0.6x | - | 100-300 Mbps | 0.8x | - | 300-500 Mbps | 1.0x | - | > 500 Mbps | 1.2x | - - - Example outputs for different connection speeds and file sizes: - - | Connection Speed | 1MB file | 10MB file | 100MB file | 500MB file | - | ---------------- | --------- | ---------- | ----------- | ----------- | - | 10 Mbps | 1 | 3 | 6 | 13 | - | 50 Mbps | 2 | 3 | 6 | 13 | - | 100 Mbps | 2 | 5 | 10 | 19 | - | 300 Mbps | 3 | 6 | 13 | 26 | - | 500 Mbps | 4 | 8 | 16 | 32 | - | 1000 Mbps | 4 | 9 | 19 | 32 | - - Args: - file_size: The size of the file to download. (required) - connection_speed: The connection speed in Mbps. (default: 80) - - Returns: - The number of connections to use. - """ - - if self._max_connections != 'auto': - return self._max_connections - - file_size_mb = file_size / (1024 * 1024) - - if file_size_mb < 1: - base_connections = 1 - elif file_size_mb <= 5: - base_connections = 4 - elif file_size_mb <= 50: - base_connections = 8 - elif file_size_mb <= 200: - base_connections = 16 - elif file_size_mb <= 400: - base_connections = 24 - else: - base_connections = 32 - - speed = 80.0 if connection_speed == 'auto' else float(connection_speed) - - if speed < 10: - multiplier = 0.2 - elif speed <= 50: - multiplier = 0.4 - elif speed <= 100: - multiplier = 0.6 - elif speed <= 300: - multiplier = 0.8 - elif speed <= 500: - multiplier = 1.0 - else: - multiplier = 1.2 - - return max(1, min(int(base_connections * multiplier), 32)) - - @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=5), reraise=True) - def _get_file_info(self, url: str) -> Tuple[int, str, str]: - """ - Retrieve file information from a given URL. - - - This method sends a HEAD request to the specified URL to obtain the file's content length, content type, and filename. - - If the filename is not present in the 'Content-Disposition' header, it attempts to extract it from the URL path. - - If the filename cannot be determined, a default name with the appropriate extension is generated based on the content type. - - Args: - url: The URL of the file to retrieve information from. (required) - - Returns: - A tuple containing the content length (int), content type (str), and filename (str). - - Raises: - RequestError: If an error occurs while sending the HEAD request. - """ - - try: - r = self._client.head(url) - r.raise_for_status() - except HTTPStatusError as e: - raise RequestError(f'An error occurred while getting file info: {str(e)}') from e - - content_length = int(r.headers.get('content-length', 0)) - content_type = r.headers.get('content-type', 'application/octet-stream').split(';')[0] - content_disposition = r.headers.get('content-disposition') - - if content_disposition and 'filename=' in content_disposition: - filename = content_disposition.split('filename=')[-1].strip('"\'') - else: - path = unquote(urlparse(url).path) - filename = Path(path).name - - if not filename: - extension = guess_mimetype_extension(content_type) - - if extension: - filename = 'downloaded_file' + extension - - return (content_length, content_type, filename) - - def _get_chunk_ranges(self, total_size: int) -> List[Tuple[int, int]]: - """ - Calculate and return the chunk ranges for downloading a file. - - - This method divides the total file size into smaller chunks based on the number of connections calculated. 
- - Each chunk is represented as a tuple containing the start and end byte positions. - - Args: - total_size: The total size of the file to be downloaded. (required) - - Returns: - A list of tuples, where each tuple contains the start and end positions (in bytes) for each chunk. - If the total size is zero, returns a single chunk with both start and end as zero. - """ - - if total_size == 0: - return [(0, 0)] - - connections = self._calculate_connections(total_size, self._connection_speed) - - optimal_chunk = max(1024 * 1024, total_size // (connections * 2)) - chunk_size = min(ceil(total_size / connections), optimal_chunk) - - ranges = [] - start = 0 - - while start < total_size: - end = min(start + chunk_size - 1, total_size - 1) - ranges.append((start, end)) - start = end + 1 - - return ranges - - @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), reraise=True) - def _download_chunk(self, url: str, start: int, end: int, progress: Progress, task_id: int) -> bytes: - """ - Downloads a chunk of a file from the given URL. - - - This method sends a GET request with a 'Range' header to the specified URL to obtain the specified chunk of the file. - - The chunk is then returned as bytes. - - Args: - url: The URL to download the chunk from. (required) - start: The start byte of the chunk. (required) - end: The end byte of the chunk. (required) - progress: The Progress object to update with the chunk's size. (required) - task_id: The task ID to update in the Progress object. (required) - - Returns: - The downloaded chunk as bytes. - - Raises: - DownloadError: If an error occurs while downloading the chunk. - """ - - headers = {**self._custom_headers} - - chunk_size = min(8192, end - start + 1) - buffer = bytearray() - - if end > 0: - headers['Range'] = f'bytes={start}-{end}' - - try: - with self._client.stream('GET', url, headers=headers) as r: - r.raise_for_status() - - for chunk in r.iter_bytes(chunk_size=chunk_size): - buffer.extend(chunk) - progress.update(task_id, advance=len(chunk)) - - return bytes(buffer) - except HTTPStatusError as e: - raise DownloadError(f'An error occurred while downloading chunk: {str(e)}') from e - - def download(self, url: str, output_path: Union[str, PathLike] = Path.cwd()) -> None: - """ - Downloads a file from the provided URL to the output file path. - - - If the output_path is a directory, the file name will be generated from the server response. - - If the output_path is a file, the file will be saved with the provided name. - - If not provided, the file will be saved to the current working directory. - - Args: - url: The download URL to download the file from. (required) - output_path: The path to save the downloaded file to. If the path is a directory, the file name will be generated from the server response. If the path is a file, the file will be saved with the provided name. If not provided, the file will be saved to the current working directory. (default: Path.cwd()) - - Raises: - DownloadError: If an error occurs while downloading the file. - RequestError: If an error occurs while getting file info. 
- """ - - try: - total_size, mime_type, suggested_filename = self._get_file_info(url) - output_path = Path(output_path) - - if output_path.is_dir(): - output_path = Path(output_path, suggested_filename) - - if not self._overwrite: - base_name = output_path.stem - extension = output_path.suffix - counter = 1 - - while output_path.exists(): - output_path = Path(output_path.parent, f'{base_name}_{counter}{extension}') - counter += 1 - - self.output_path = output_path.as_posix() - - progress_columns = [ - TextColumn(f'Downloading a {mime_type.split("/")[0] if mime_type else "unknown"} file ({mime_type})'), - BarColumn(), - DownloadColumn(), - TransferSpeedColumn(), - TimeRemainingColumn(), - ] - - with Progress(*progress_columns, disable=not self._show_progress_bar) as progress: - task_id = progress.add_task('download', total=total_size or 100, filename=output_path.name, mime=mime_type) - - if total_size == 0: - chunk = self._download_chunk(url, 0, 0, progress, task_id) - - with Path(output_path).open('wb') as fo: - fo.write(chunk) - else: - chunks = [] - ranges = self._get_chunk_ranges(total_size) - connections = len(ranges) - - with ThreadPoolExecutor(max_workers=connections) as executor: - futures = [ - executor.submit(self._download_chunk, url, start, end, progress, task_id) for start, end in ranges - ] - chunks = [f.result() for f in futures] - - with Path(output_path).open('wb') as fo: - for chunk in chunks: - fo.write(chunk) - except Exception as e: - raise DownloadError(f'An error occurred while downloading file: {str(e)}') from e diff --git a/streamsnapper/exceptions.py b/streamsnapper/exceptions.py index f3648f1..9d7ed43 100644 --- a/streamsnapper/exceptions.py +++ b/streamsnapper/exceptions.py @@ -1,46 +1,34 @@ -class StreamBaseError(Exception): - """Base exception for StreamSnapper errors.""" +class StreamSnapperError(Exception): + """Base class for all StreamSnapper exceptions.""" pass -class DownloadError(StreamBaseError): - """Exception raised when an error occurs while downloading a file.""" - - pass - - -class EmptyDataError(StreamBaseError): +class EmptyDataError(StreamSnapperError): """Exception raised when no data is available.""" pass -class FFmpegNotFoundError(StreamBaseError): +class FFmpegNotFoundError(StreamSnapperError): """Exception raised when the FFmpeg executable is not found.""" pass -class InvalidDataError(StreamBaseError): +class InvalidDataError(StreamSnapperError): """Exception raised when invalid data is provided.""" pass -class MergeError(StreamBaseError): +class MergeError(StreamSnapperError): """Exception raised when an error occurs while merging files.""" pass -class RequestError(StreamBaseError): - """Exception raised when an error occurs while making a request.""" - - pass - - -class ScrapingError(StreamBaseError): +class ScrapingError(StreamSnapperError): """Exception raised when an error occurs while scraping data.""" pass diff --git a/streamsnapper/platforms/youtube.py b/streamsnapper/platforms/youtube.py index 6fcc298..ab11bcd 100644 --- a/streamsnapper/platforms/youtube.py +++ b/streamsnapper/platforms/youtube.py @@ -16,11 +16,11 @@ get_playlist as scrape_youtube_playlist, get_channel as scrape_youtube_channel, ) +from turbodl import TurboDL from yt_dlp import YoutubeDL from yt_dlp import utils as yt_dlp_utils # Local imports -from ..downloader import Downloader from ..exceptions import EmptyDataError, InvalidDataError, ScrapingError from ..functions import format_string, get_value from ..merger import Merger @@ -596,7 +596,7 @@ def download( 
tmp_path.mkdir(exist_ok=True) output_video_path = Path(tmp_path, f'.tmp-video-{self.general_info["id"]}.{video_stream["extension"]}') - video_downloader = Downloader( + video_downloader = TurboDL( max_connections=max_connections, connection_speed=connection_speed, overwrite=overwrite, @@ -606,7 +606,7 @@ def download( video_downloader.download(video_stream['url'], output_video_path) output_audio_path = Path(tmp_path, f'.tmp-audio-{self.general_info["id"]}.{audio_stream["extension"]}') - audio_downloader = Downloader( + audio_downloader = TurboDL( max_connections=max_connections, connection_speed=connection_speed, overwrite=overwrite, @@ -629,7 +629,7 @@ def download( output_path, f'{self.general_info["cleanTitle"]} [{self.general_info["id"]}].{video_stream["extension"]}' ) - downloader = Downloader( + downloader = TurboDL( max_connections=max_connections, connection_speed=connection_speed, overwrite=overwrite, @@ -645,7 +645,7 @@ def download( output_path, f'{self.general_info["cleanTitle"]} [{self.general_info["id"]}].{audio_stream["extension"]}' ) - downloader = Downloader( + downloader = TurboDL( max_connections=max_connections, connection_speed=connection_speed, overwrite=overwrite, diff --git a/tests/all.py b/tests/all.py index 14972a1..7fa666d 100644 --- a/tests/all.py +++ b/tests/all.py @@ -1,6 +1,5 @@ # Built-in imports from pathlib import Path -from random import choice from typing import List, Optional, Tuple # Third-party imports @@ -9,42 +8,23 @@ # Local imports from streamsnapper import ( - Downloader, - DownloadError, EmptyDataError, FFmpegNotFoundError, InvalidDataError, MergeError, Merger, - RequestError, ScrapingError, - StreamBaseError, + StreamSnapperError, YouTube, YouTubeExtractor, ) -class TestDownloader: - @fixture - def download_urls(self) -> List[str]: - return ['https://httpbin.org/image/png', 'https://httpbin.org/image/svg', 'https://httpbin.org/image/webp'] - - def test_file_download(self, download_urls: List[str]) -> None: - downloader: Downloader = Downloader( - max_connections='auto', connection_speed=1000, overwrite=True, show_progress_bar=True, custom_headers=None, timeout=10 - ) - - try: - downloader.download(url=choice(download_urls), output_path=Path.cwd()) - except (DownloadError, RequestError, StreamBaseError) as e: - fail(f'Something went wrong while downloading a file. Error: {e}') - - class TestMerger: def test_merger_initialization(self) -> None: try: Merger() - except (FFmpegNotFoundError, MergeError, StreamBaseError) as e: + except (FFmpegNotFoundError, MergeError, StreamSnapperError) as e: fail(f'Something went wrong while initializing the Merger class. Error: {e}') @@ -58,7 +38,7 @@ def test_video_data_extractor(self) -> None: youtube.analyze_video_streams(preferred_quality='all') youtube.analyze_audio_streams(preferred_language='local') youtube.analyze_subtitle_streams() - except (ValueError, InvalidDataError, ScrapingError, InvalidDataError, EmptyDataError, StreamBaseError) as e: + except (ValueError, InvalidDataError, ScrapingError, InvalidDataError, EmptyDataError, StreamSnapperError) as e: fail(f'Something went wrong while extracting a YouTube video data. Error: {e}') assert youtube.general_info is not None, 'General information is not available.'
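
For downstream users, the practical upshot of this diff is that the `streamsnapper[downloader]` extra and the in-tree `Downloader` class are gone; the `youtube` and `all` extras now pull in the external `turbodl` package instead, and `youtube.py` forwards the same `max_connections`, `connection_speed`, and `overwrite` arguments to `TurboDL` and calls the same `download(url, output_path)` method. Below is a minimal migration sketch for code that imported `Downloader` directly. It uses only the constructor arguments this diff actually passes to `TurboDL`; the `'auto'` value, the `80` Mbps default, and the example URL are carried over from the removed README snippet and are assumptions, not verified against turbodl's own documentation.

```python
from pathlib import Path

from turbodl import TurboDL  # replaces: from streamsnapper import Downloader

# Only the arguments this diff forwards to TurboDL are used here; the values
# mirror the removed Downloader's defaults and are assumptions about turbodl 0.0.1.
downloader = TurboDL(
    max_connections='auto',
    connection_speed=80,
    overwrite=True,
)

# Same call shape as before: per the removed Downloader's contract, when
# output_path is a directory the file name comes from the server response.
downloader.download('https://example.com/file', Path.cwd())
```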
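
Error handling changes in the same way: `StreamBaseError` is renamed to `StreamSnapperError`, and `DownloadError`/`RequestError` are deleted outright, so download failures now surface as whatever `turbodl` raises rather than as StreamSnapper exceptions. A sketch of the corresponding caller-side update, using only names exported by the new `__init__.py` above:

```python
from streamsnapper import Merger, StreamSnapperError

try:
    # Merger() raises FFmpegNotFoundError when the FFmpeg executable is absent,
    # as exercised by TestMerger in tests/all.py.
    Merger()
except StreamSnapperError as e:
    # The renamed base class now covers every exception the library defines:
    # EmptyDataError, FFmpegNotFoundError, InvalidDataError, MergeError, ScrapingError.
    print(f'StreamSnapper error: {e}')
```

Note that code which previously caught `DownloadError` or `RequestError` around `Downloader.download()` has no in-library replacement; those failure modes now belong to `turbodl`.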