From 9a26ee31bcb3e0bfa3d14b48d00a1d14d2183616 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 31 Oct 2024 14:30:01 +0000 Subject: [PATCH] Simplify scraper by dropping --type CLI argument --- .github/workflows/Tests.yml | 2 +- CHANGELOG.md | 5 ++ CONTRIBUTING.md | 2 +- README.md | 10 +-- scraper/src/youtube2zim/constants.py | 4 -- scraper/src/youtube2zim/entrypoint.py | 16 +++-- .../src/youtube2zim/playlists/entrypoint.py | 8 +-- scraper/src/youtube2zim/playlists/scraper.py | 15 ++-- scraper/src/youtube2zim/schemas.py | 1 - scraper/src/youtube2zim/scraper.py | 50 ++------------ scraper/src/youtube2zim/youtube.py | 69 ++++++++++++------- scraper/tests-integration/integration.py | 1 - zimui/cypress/fixtures/channel/channel.json | 1 - zimui/src/types/Channel.ts | 7 -- 14 files changed, 81 insertions(+), 110 deletions(-) diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml index 5f5c21d8..df620f32 100644 --- a/.github/workflows/Tests.yml +++ b/.github/workflows/Tests.yml @@ -100,7 +100,7 @@ jobs: env: YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} OPTIMIZATION_CACHE_URL: ${{ secrets.OPTIMIZATION_CACHE_URL }} - run: docker run -v $PWD/output:/output youtube2zim youtube2zim --api-key "$YOUTUBE_API_KEY" --optimization-cache "$OPTIMIZATION_CACHE_URL" --type channel --id "UC8elThf5TGMpQfQc_VE917Q" --name "tests_en_openzim-testing" --zim-file "openZIM_testing.zim" --tags "tEsTing,x-mark:yes" + run: docker run -v $PWD/output:/output youtube2zim youtube2zim --api-key "$YOUTUBE_API_KEY" --optimization-cache "$OPTIMIZATION_CACHE_URL" --id "UC8elThf5TGMpQfQc_VE917Q" --name "tests_en_openzim-testing" --zim-file "openZIM_testing.zim" --tags "tEsTing,x-mark:yes" - name: Run integration test suite run: docker run -v $PWD/scraper/tests-integration/integration.py:/src/scraper/tests-integration/integration.py -v $PWD/output:/output youtube2zim bash -c "pip install pytest; pytest -v /src/scraper/tests-integration/integration.py" diff --git a/CHANGELOG.md 
b/CHANGELOG.md index dd71e3b2..6dc8adc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Deprecated + +- `--type` CLI argument is now deprecated (will be removed in next major) + ### Changed - Raise exception if there are no videos in the playlists (#347) +- Drop `--type` CLI argument and guess `--id` type (#361) ### Fixed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5543d444..d4cfe36c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,7 +66,7 @@ docker build -t local-youtube2zim . Scrape a channel (here we use the [openZIM_testing](https://www.youtube.com/channel/UC8elThf5TGMpQfQc_VE917Q) channel, but you could use any other one of interest for your UI developments). ``` -docker run --rm -it -v "$PWD/output":/output local-youtube2zim youtube2zim --api-key --type channel --id "UC8elThf5TGMpQfQc_VE917Q" --name "openZIM_testing" --zim-file "openZIM_testing" +docker run --rm -it -v "$PWD/output":/output local-youtube2zim youtube2zim --api-key --id "UC8elThf5TGMpQfQc_VE917Q" --name "openZIM_testing" --zim-file "openZIM_testing" ``` Extract interesting ZIM content and move it to `public` folder. diff --git a/README.md b/README.md index b63777dc..a11cf190 100644 --- a/README.md +++ b/README.md @@ -78,18 +78,18 @@ To get an API Key: You can then create a ZIM from a singe channel / user / handle like `Vsauce`: ```bash -youtube2zim --api-key "" --type channel --id "Vsauce" --name "tests_hi_avanti" +youtube2zim --api-key "" --id "Vsauce" --name "tests_hi_avanti" ``` -When `--type channel` is used, you must pass one single value in `--id` and it can be the channel, user or playlist, or even the corresponding technical ID (see [FAQ/FEE](https://github.com/openzim/youtube/wiki/FAQ---FEE) for more details). 
+When scraping a channel, you must pass one single value in `--id` and it can be the handle, user, or even the corresponding technical ID (see [FAQ/FEE](https://github.com/openzim/youtube/wiki/FAQ---FEE) for more details). Or you can create a ZIM from two playlists like `PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp` and `PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z`: ```bash -youtube2zim --api-key "" --type playlist --id "PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp,PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z" --name "tests_hi_avanti" +youtube2zim --api-key "" --id "PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp,PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z" --name "tests_hi_avanti" ``` -When `--type playlist` is used, you can pass multiple playlist IDs separated by a comma in `--id`. +When scraping playlists, you can pass multiple playlist IDs separated by a comma in `--id`. For more details / advanced usage, see the [Manual](https://github.com/openzim/youtube/wiki/Manual). @@ -110,7 +110,7 @@ This script is a wrapper around `youtube2zim` and is bundled with the main packa Sample usage: ``` -youtube2zim-playlists --indiv-playlists --api-key XXX --type channel --id Vsauce --playlists-name="vsauce_en_playlist-{playlist_id}" +youtube2zim-playlists --indiv-playlists --api-key XXX --id Vsauce --playlists-name="vsauce_en_playlist-{playlist_id}" ``` Those are the required arguments for `youtube2zim-playlists` but **you can also pass any regular `youtube2zim` argument**. Those will be forwarded to `youtube2zim` (which will be run independently for each playlist). 
diff --git a/scraper/src/youtube2zim/constants.py b/scraper/src/youtube2zim/constants.py index 67ed9c16..1fed8030 100644 --- a/scraper/src/youtube2zim/constants.py +++ b/scraper/src/youtube2zim/constants.py @@ -13,10 +13,6 @@ SCRAPER = f"{NAME} {__version__}" -CHANNEL = "channel" -PLAYLIST = "playlist" -USER = "user" - # Youtube uses some non-standard language codes YOUTUBE_LANG_MAP = { "iw": "he", # Hebrew diff --git a/scraper/src/youtube2zim/entrypoint.py b/scraper/src/youtube2zim/entrypoint.py index b925a3f0..39ab113d 100755 --- a/scraper/src/youtube2zim/entrypoint.py +++ b/scraper/src/youtube2zim/entrypoint.py @@ -6,7 +6,7 @@ import os import sys -from youtube2zim.constants import CHANNEL, NAME, PLAYLIST, SCRAPER, USER, logger +from youtube2zim.constants import NAME, SCRAPER, logger from youtube2zim.scraper import Youtube2Zim @@ -16,12 +16,12 @@ def main(): description="Scraper to create a ZIM file from a Youtube Channel or Playlists", ) + # Not used anymore, kept for backward compatibility till next major release + # When removing it, also remove the `not_used_anymore` filter applied to Youtube2Zim kwargs below parser.add_argument( "--type", help="Type of collection", - choices=[CHANNEL, PLAYLIST, USER], - required=True, - dest="collection_type", + dest="not_used_anymore", ) parser.add_argument( "--id", help="Youtube ID of the collection", required=True, dest="youtube_id" @@ -208,7 +208,13 @@ def main(): try: if args.max_concurrency < 1: raise ValueError(f"Invalid concurrency value: {args.max_concurrency}") - scraper = Youtube2Zim(**dict(args._get_kwargs())) + scraper = Youtube2Zim( + **{ + key: value + for key, value in dict(args._get_kwargs()).items() + if key != "not_used_anymore" + } + ) return scraper.run() except Exception as exc: logger.error(f"FAILED. 
An error occurred: {exc}") diff --git a/scraper/src/youtube2zim/playlists/entrypoint.py b/scraper/src/youtube2zim/playlists/entrypoint.py index bb7b9561..97e8a486 100644 --- a/scraper/src/youtube2zim/playlists/entrypoint.py +++ b/scraper/src/youtube2zim/playlists/entrypoint.py @@ -5,7 +5,7 @@ import logging import sys -from youtube2zim.constants import CHANNEL, NAME, PLAYLIST, SCRAPER, USER, logger +from youtube2zim.constants import NAME, SCRAPER, logger from youtube2zim.utils import has_argument @@ -19,13 +19,13 @@ def main(): "{creator_id}, {creator_name}.", ) + # Not used anymore, kept for backward compatibility till next major release parser.add_argument( "--type", help="Type of collection", - choices=[CHANNEL, PLAYLIST, USER], - required=True, - dest="collection_type", + dest="not_used_anymore", ) + parser.add_argument( "--id", help="Youtube ID of the collection", required=True, dest="youtube_id" ) diff --git a/scraper/src/youtube2zim/playlists/scraper.py b/scraper/src/youtube2zim/playlists/scraper.py index db885cd8..1be64081 100644 --- a/scraper/src/youtube2zim/playlists/scraper.py +++ b/scraper/src/youtube2zim/playlists/scraper.py @@ -21,7 +21,7 @@ import requests from zimscraperlib.logging import nicer_args_join -from youtube2zim.constants import NAME, PLAYLIST, YOUTUBE, logger +from youtube2zim.constants import NAME, YOUTUBE, logger from youtube2zim.youtube import ( REQUEST_TIMEOUT, credentials_ok, @@ -40,7 +40,6 @@ def __init__( self.debug = options["debug"] self.disable_metadata_checks = options["disable_metadata_checks"] self.playlists_mode = options["playlists_mode"] - self.collection_type = options["collection_type"] self.youtube_id = options["youtube_id"] self.extra_args = extra_args @@ -76,10 +75,7 @@ def run(self): shutil.rmtree(self.build_dir, ignore_errors=True) # not needed return self.handle_single_zim() - logger.info( - f"starting all-playlits {NAME} scraper " - f"for {self.collection_type}#{self.youtube_id}" - ) + logger.info(f"starting 
all-playlists {NAME} scraper for {self.youtube_id}") # create required sub folders for sub_folder in ("cache", "videos", "channels"): @@ -96,7 +92,8 @@ def run(self): playlists, main_channel_id, uploads_playlist_id, - ) = extract_playlists_details_from(self.collection_type, self.youtube_id) + is_playlist, + ) = extract_playlists_details_from(self.youtube_id) logger.info( ".. {} playlists:\n {}".format( @@ -128,8 +125,6 @@ def run_playlist_zim(self, playlist): playlist_id = playlist.playlist_id args = [ *self.youtube2zim_exe, - "--type", - PLAYLIST, "--id", playlist_id, "--api-key", @@ -180,8 +175,6 @@ def handle_single_zim(self): args = [ *self.youtube2zim_exe, - "--type", - self.collection_type, "--id", self.youtube_id, "--api-key", diff --git a/scraper/src/youtube2zim/schemas.py b/scraper/src/youtube2zim/schemas.py index f7d390a7..4748eb3c 100644 --- a/scraper/src/youtube2zim/schemas.py +++ b/scraper/src/youtube2zim/schemas.py @@ -105,7 +105,6 @@ class Channel(CamelModel): profile_path: str | None = None banner_path: str | None = None joined_date: str - collection_type: str main_playlist: str | None = None playlist_count: int diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py index b1e94b3f..805de89d 100644 --- a/scraper/src/youtube2zim/scraper.py +++ b/scraper/src/youtube2zim/scraper.py @@ -43,11 +43,8 @@ ) from youtube2zim.constants import ( - CHANNEL, - PLAYLIST, ROOT_DIR, SCRAPER, - USER, YOUTUBE, YOUTUBE_LANG_MAP, logger, @@ -86,13 +83,10 @@ skip_outofrange_videos, ) -MAXIMUM_YOUTUBEID_LENGTH = 24 - class Youtube2Zim: def __init__( self, - collection_type, youtube_id, api_key, video_format, @@ -124,13 +118,6 @@ def __init__( secondary_color=None, ): # data-retrieval info - self.collection_type = collection_type - if self.collection_type == USER: - logger.warning( - "Collection type 'user' is deprecated. Please use 'channel' type," - " behaviors have been merged. 
'user' type is going to be dropped in " - " next major release" - ) self.youtube_id = youtube_id self.api_key = api_key self.dateafter = dateafter @@ -233,23 +220,9 @@ def profile_path(self): def banner_path(self): return self.build_dir.joinpath("banner.jpg") - @property - def is_user(self): - return self.collection_type == USER - - @property - def is_channel(self): - return self.collection_type == CHANNEL - - @property - def is_playlist(self): - return self.collection_type == PLAYLIST - @property def is_single_channel(self): - if self.is_channel or self.is_user: - return True - return len(list({pl.creator_id for pl in self.playlists})) == 1 + return len({pl.creator_id for pl in self.playlists}) == 1 @property def sorted_playlists(self): @@ -282,8 +255,6 @@ def run(self): # first report => creates a file with appropriate structure self.report_progress() - self.validate_id() - # validate dateafter input self.validate_dateafter_input() @@ -303,9 +274,7 @@ def run(self): if not self.build_dir.exists() or not self.build_dir.is_dir(): raise OSError(f"Incorrect build_dir: {self.build_dir}") - logger.info( - f"starting youtube scraper for {self.collection_type}#{self.youtube_id}" - ) + logger.info(f"starting youtube scraper for {self.youtube_id}") logger.info(f"preparing build folder at {self.build_dir.resolve()}") self.prepare_build_folder() @@ -497,17 +466,6 @@ def validate_dateafter_input(self): ) raise ValueError(f"Invalid dateafter input: {exc}") from exc - def validate_id(self): - # space not allowed in youtube-ID - self.youtube_id = self.youtube_id.replace(" ", "") - if ( - self.collection_type == "channel" - and len(self.youtube_id) > MAXIMUM_YOUTUBEID_LENGTH - ): - raise ValueError("Invalid ChannelId") - if "," in self.youtube_id and self.collection_type != "playlist": - raise ValueError("Invalid YoutubeId") - def prepare_build_folder(self): """prepare build folder before we start downloading data""" @@ -590,7 +548,8 @@ def extract_playlists(self): 
self.playlists, self.main_channel_id, self.uploads_playlist_id, - ) = extract_playlists_details_from(self.collection_type, self.youtube_id) + self.is_playlist, + ) = extract_playlists_details_from(self.youtube_id) def extract_videos_list(self): all_videos = load_json(self.cache_dir, "videos") @@ -1262,7 +1221,6 @@ def get_playlist_slug(playlist) -> str: channel_description=channel_data["snippet"]["description"], profile_path="profile.jpg", banner_path="banner.jpg", - collection_type=self.collection_type, main_playlist=main_playlist_slug, playlist_count=len(self.playlists), joined_date=channel_data["snippet"]["publishedAt"], diff --git a/scraper/src/youtube2zim/youtube.py b/scraper/src/youtube2zim/youtube.py index 8eb17b09..fd565f5b 100644 --- a/scraper/src/youtube2zim/youtube.py +++ b/scraper/src/youtube2zim/youtube.py @@ -8,7 +8,7 @@ from zimscraperlib.download import stream_file from zimscraperlib.image.transformation import resize_image -from youtube2zim.constants import CHANNEL, PLAYLIST, USER, YOUTUBE, logger +from youtube2zim.constants import YOUTUBE, logger from youtube2zim.utils import get_slug, load_json, save_json YOUTUBE_API = "https://www.googleapis.com/youtube/v3" @@ -23,6 +23,18 @@ REQUEST_TIMEOUT = 60 +class ChannelNotFoundError(Exception): + """Exception raised when requested channel is not found""" + + pass + + +class PlaylistNotFoundError(Exception): + """Exception raised when requested playlist is not found""" + + pass + + class Playlist: def __init__( self, @@ -104,8 +116,9 @@ def get_channel_json(channel_id): logger.warning(f"Failed to find {channel_id} by {criteria}") continue channel_json = req_json["items"][0] + break if channel_json is None: - raise Exception(f"Impossible to find {channel_id}, check for typos") + raise ChannelNotFoundError(f"Invalid channel ID `{channel_id}`: Not Found") save_json(YOUTUBE.cache_dir, fname, channel_json) return channel_json @@ -164,8 +177,9 @@ def get_playlist_json(playlist_id): try: playlist_json = 
req.json()["items"][0] except IndexError: - logger.error(f"Invalid playlistId `{playlist_id}`: Not Found") - raise + raise PlaylistNotFoundError( + f"Invalid playlistId `{playlist_id}`: Not Found" + ) from None save_json(YOUTUBE.cache_dir, fname, playlist_json) return playlist_json @@ -319,34 +333,43 @@ def skip_outofrange_videos(date_range, item): return dt_parser.parse(item["snippet"]["publishedAt"]).date() in date_range -def extract_playlists_details_from(collection_type, youtube_id): - """prepare a list of Playlist from user request - - USER: we fetch the hidden channel associate to it - CHANNEL (and USER): we grab all playlists + `uploads` playlist - PLAYLIST: we retrieve from the playlist Id(s)""" +def extract_playlists_details_from(youtube_id: str): + """prepare a list of Playlist from user request""" uploads_playlist_id = None main_channel_id = None - if collection_type in (USER, CHANNEL): - # get_channel_json is capable to retrieve user and channel - channel_json = get_channel_json(youtube_id) - main_channel_id = channel_json["id"] - - # retrieve list of playlists for that channel - playlist_ids = [p["id"] for p in get_channel_playlists_json(main_channel_id)] - # we always include uploads playlist (contains everything) - playlist_ids += [channel_json["contentDetails"]["relatedPlaylists"]["uploads"]] - uploads_playlist_id = playlist_ids[-1] - elif collection_type == PLAYLIST: + if "," not in youtube_id: + try: + # first try to consider passed ID is a channel ID (or username or handle) + channel_json = get_channel_json(youtube_id) + main_channel_id = channel_json["id"] + # retrieve list of playlists for that channel + playlist_ids = [ + p["id"] for p in get_channel_playlists_json(main_channel_id) + ] + # we always include uploads playlist (contains everything) + playlist_ids += [ + channel_json["contentDetails"]["relatedPlaylists"]["uploads"] + ] + uploads_playlist_id = playlist_ids[-1] + is_playlist = False + except ChannelNotFoundError: + # channel not 
found, then ID should be a playlist + playlist_ids = [youtube_id] + main_channel_id = Playlist.from_id(youtube_id).creator_id + is_playlist = True + else: + # only playlists are supported in CSV ; let's grab all playlists info + # (intentionally, to check they are all ok) and use channel of first playlist as + # main channel ID playlist_ids = youtube_id.split(",") main_channel_id = Playlist.from_id(playlist_ids[0]).creator_id - else: - raise NotImplementedError("unsupported collection_type") + is_playlist = True return ( # dict.fromkeys maintains the order of playlist_ids while removing duplicates [Playlist.from_id(playlist_id) for playlist_id in dict.fromkeys(playlist_ids)], main_channel_id, uploads_playlist_id, + is_playlist, ) diff --git a/scraper/tests-integration/integration.py b/scraper/tests-integration/integration.py index bf6ca6ad..8241a565 100644 --- a/scraper/tests-integration/integration.py +++ b/scraper/tests-integration/integration.py @@ -47,7 +47,6 @@ def test_zim_channel_json(): assert channel_json["id"] == "UC8elThf5TGMpQfQc_VE917Q" assert channel_json["channelName"] == "openZIM_testing" - assert channel_json["collectionType"] == "channel" assert channel_json["mainPlaylist"] == "uploads_from_openzim_testing-917Q" diff --git a/zimui/cypress/fixtures/channel/channel.json b/zimui/cypress/fixtures/channel/channel.json index daf09514..19696e8b 100644 --- a/zimui/cypress/fixtures/channel/channel.json +++ b/zimui/cypress/fixtures/channel/channel.json @@ -7,6 +7,5 @@ "profilePath": "profile.jpg", "bannerPath": "banner.jpg", "joinedDate": "2024-06-04T13:30:16.232286Z", - "collectionType": "channel", "mainPlaylist": "uploads_from_openzim_testing-917Q" } diff --git a/zimui/src/types/Channel.ts b/zimui/src/types/Channel.ts index 4cfa8a8d..21039f27 100644 --- a/zimui/src/types/Channel.ts +++ b/zimui/src/types/Channel.ts @@ -7,7 +7,6 @@ export interface Channel { profilePath?: string bannerPath?: string joinedDate: string - collectionType: string mainPlaylist?: 
string playlistCount: number } @@ -25,9 +24,3 @@ export interface Author { profilePath?: string bannerPath?: string } - -export enum CollectionType { - Playlist = 'playlist', - Video = 'video', - Channel = 'channel' -}