From 9a26ee31bcb3e0bfa3d14b48d00a1d14d2183616 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Thu, 31 Oct 2024 14:30:01 +0000
Subject: [PATCH] Simplify scraper by dropping --type CLI argument

---
 .github/workflows/Tests.yml                   |  2 +-
 CHANGELOG.md                                  |  5 ++
 CONTRIBUTING.md                               |  2 +-
 README.md                                     | 10 +--
 scraper/src/youtube2zim/constants.py          |  4 --
 scraper/src/youtube2zim/entrypoint.py         | 16 +++--
 .../src/youtube2zim/playlists/entrypoint.py   |  8 +--
 scraper/src/youtube2zim/playlists/scraper.py  | 15 ++--
 scraper/src/youtube2zim/schemas.py            |  1 -
 scraper/src/youtube2zim/scraper.py            | 50 ++------------
 scraper/src/youtube2zim/youtube.py            | 69 ++++++++++++-------
 scraper/tests-integration/integration.py      |  1 -
 zimui/cypress/fixtures/channel/channel.json   |  1 -
 zimui/src/types/Channel.ts                    |  7 --
 14 files changed, 81 insertions(+), 110 deletions(-)
diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml
index 5f5c21d8..df620f32 100644
--- a/.github/workflows/Tests.yml
+++ b/.github/workflows/Tests.yml
@@ -100,7 +100,7 @@ jobs:
         env:
           YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
           OPTIMIZATION_CACHE_URL: ${{ secrets.OPTIMIZATION_CACHE_URL }}
-        run: docker run -v $PWD/output:/output youtube2zim youtube2zim --api-key "$YOUTUBE_API_KEY" --optimization-cache "$OPTIMIZATION_CACHE_URL" --type channel --id "UC8elThf5TGMpQfQc_VE917Q" --name "tests_en_openzim-testing" --zim-file "openZIM_testing.zim" --tags "tEsTing,x-mark:yes"
+        run: docker run -v $PWD/output:/output youtube2zim youtube2zim --api-key "$YOUTUBE_API_KEY" --optimization-cache "$OPTIMIZATION_CACHE_URL" --id "UC8elThf5TGMpQfQc_VE917Q" --name "tests_en_openzim-testing" --zim-file "openZIM_testing.zim" --tags "tEsTing,x-mark:yes"
 
       - name: Run integration test suite
         run: docker run -v $PWD/scraper/tests-integration/integration.py:/src/scraper/tests-integration/integration.py -v $PWD/output:/output youtube2zim bash -c "pip install pytest; pytest -v /src/scraper/tests-integration/integration.py"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd71e3b2..6dc8adc2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Deprecated
+
+- `--type` CLI argument is now deprecated and ignored (will be removed in the next major release)
+
 ### Changed
 
 - Raise exception if there are no videos in the playlists (#347)
+- Stop relying on `--type` CLI argument and guess the type of `--id` automatically (#361)
 
 ### Fixed
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5543d444..d4cfe36c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -66,7 +66,7 @@ docker build -t local-youtube2zim .
 Scrape a channel (here we use the [openZIM_testing](https://www.youtube.com/channel/UC8elThf5TGMpQfQc_VE917Q) channel, but you could use any other one of interest for your UI developments).
 
 ```
-docker run --rm -it -v "$PWD/output":/output local-youtube2zim youtube2zim --api-key <YOUR-API-KEY> --type channel --id "UC8elThf5TGMpQfQc_VE917Q" --name "openZIM_testing" --zim-file "openZIM_testing"
+docker run --rm -it -v "$PWD/output":/output local-youtube2zim youtube2zim --api-key <YOUR-API-KEY> --id "UC8elThf5TGMpQfQc_VE917Q" --name "openZIM_testing" --zim-file "openZIM_testing"
 ```
 
 Extract interesting ZIM content and move it to `public` folder.
diff --git a/README.md b/README.md
index b63777dc..a11cf190 100644
--- a/README.md
+++ b/README.md
@@ -78,18 +78,18 @@ To get an API Key:
 You can then create a ZIM from a singe channel / user / handle like `Vsauce`:
 
 ```bash
-youtube2zim --api-key "<your-api-key>" --type channel --id "Vsauce" --name "tests_hi_avanti"
+youtube2zim --api-key "<your-api-key>" --id "Vsauce" --name "tests_hi_avanti"
 ```
 
-When `--type channel` is used, you must pass one single value in `--id` and it can be the channel, user or playlist, or even the corresponding technical ID (see [FAQ/FEE](https://github.com/openzim/youtube/wiki/FAQ---FEE) for more details).
+When scraping a channel, you must pass one single value in `--id` and it can be the handle, user, or even the corresponding technical ID (see [FAQ/FEE](https://github.com/openzim/youtube/wiki/FAQ---FEE) for more details).
 
 Or you can create a ZIM from two playlists like `PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp` and `PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z`:
 
 ```bash
-youtube2zim --api-key "<your-api-key>" --type playlist --id "PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp,PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z" --name "tests_hi_avanti"
+youtube2zim --api-key "<your-api-key>" --id "PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp,PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z" --name "tests_hi_avanti"
 ```
 
-When `--type playlist` is used, you can pass multiple playlist IDs separated by a comma in `--id`.
+When scraping playlists, you can pass multiple playlist IDs separated by a comma in `--id`.
 
 For more details / advanced usage, see the [Manual](https://github.com/openzim/youtube/wiki/Manual).
 
@@ -110,7 +110,7 @@ This script is a wrapper around `youtube2zim` and is bundled with the main packa
 Sample usage:
 
 ```
-youtube2zim-playlists --indiv-playlists --api-key XXX --type channel --id Vsauce --playlists-name="vsauce_en_playlist-{playlist_id}"
+youtube2zim-playlists --indiv-playlists --api-key XXX --id Vsauce --playlists-name="vsauce_en_playlist-{playlist_id}"
 ```
 
 Those are the required arguments for `youtube2zim-playlists` but **you can also pass any regular `youtube2zim` argument**. Those will be forwarded to `youtube2zim` (which will be run independently for each playlist).
diff --git a/scraper/src/youtube2zim/constants.py b/scraper/src/youtube2zim/constants.py
index 67ed9c16..1fed8030 100644
--- a/scraper/src/youtube2zim/constants.py
+++ b/scraper/src/youtube2zim/constants.py
@@ -13,10 +13,6 @@
 
 SCRAPER = f"{NAME} {__version__}"
 
-CHANNEL = "channel"
-PLAYLIST = "playlist"
-USER = "user"
-
 # Youtube uses some non-standard language codes
 YOUTUBE_LANG_MAP = {
     "iw": "he",  # Hebrew
diff --git a/scraper/src/youtube2zim/entrypoint.py b/scraper/src/youtube2zim/entrypoint.py
index b925a3f0..39ab113d 100755
--- a/scraper/src/youtube2zim/entrypoint.py
+++ b/scraper/src/youtube2zim/entrypoint.py
@@ -6,7 +6,7 @@
 import os
 import sys
 
-from youtube2zim.constants import CHANNEL, NAME, PLAYLIST, SCRAPER, USER, logger
+from youtube2zim.constants import NAME, SCRAPER, logger
 from youtube2zim.scraper import Youtube2Zim
 
 
@@ -16,12 +16,12 @@ def main():
         description="Scraper to create a ZIM file from a Youtube Channel or Playlists",
     )
 
+    # Not used anymore, kept for backward compatibility till next major release
+    # Also remove the kwargs filtering trick (lines 211-217) when dropping this
     parser.add_argument(
         "--type",
         help="Type of collection",
-        choices=[CHANNEL, PLAYLIST, USER],
-        required=True,
-        dest="collection_type",
+        dest="not_used_anymore",
     )
     parser.add_argument(
         "--id", help="Youtube ID of the collection", required=True, dest="youtube_id"
@@ -208,7 +208,13 @@ def main():
     try:
         if args.max_concurrency < 1:
             raise ValueError(f"Invalid concurrency value: {args.max_concurrency}")
-        scraper = Youtube2Zim(**dict(args._get_kwargs()))
+        scraper = Youtube2Zim(
+            **{
+                key: value
+                for key, value in dict(args._get_kwargs()).items()
+                if key != "not_used_anymore"
+            }
+        )
         return scraper.run()
     except Exception as exc:
         logger.error(f"FAILED. An error occurred: {exc}")
diff --git a/scraper/src/youtube2zim/playlists/entrypoint.py b/scraper/src/youtube2zim/playlists/entrypoint.py
index bb7b9561..97e8a486 100644
--- a/scraper/src/youtube2zim/playlists/entrypoint.py
+++ b/scraper/src/youtube2zim/playlists/entrypoint.py
@@ -5,7 +5,7 @@
 import logging
 import sys
 
-from youtube2zim.constants import CHANNEL, NAME, PLAYLIST, SCRAPER, USER, logger
+from youtube2zim.constants import NAME, SCRAPER, logger
 from youtube2zim.utils import has_argument
 
 
@@ -19,13 +19,13 @@ def main():
         "{creator_id}, {creator_name}.",
     )
 
+    # Not used anymore, kept for backward compatibility till next major release
     parser.add_argument(
         "--type",
         help="Type of collection",
-        choices=[CHANNEL, PLAYLIST, USER],
-        required=True,
-        dest="collection_type",
+        dest="not_used_anymore",
     )
+
     parser.add_argument(
         "--id", help="Youtube ID of the collection", required=True, dest="youtube_id"
     )
diff --git a/scraper/src/youtube2zim/playlists/scraper.py b/scraper/src/youtube2zim/playlists/scraper.py
index db885cd8..1be64081 100644
--- a/scraper/src/youtube2zim/playlists/scraper.py
+++ b/scraper/src/youtube2zim/playlists/scraper.py
@@ -21,7 +21,7 @@
 import requests
 from zimscraperlib.logging import nicer_args_join
 
-from youtube2zim.constants import NAME, PLAYLIST, YOUTUBE, logger
+from youtube2zim.constants import NAME, YOUTUBE, logger
 from youtube2zim.youtube import (
     REQUEST_TIMEOUT,
     credentials_ok,
@@ -40,7 +40,6 @@ def __init__(
         self.debug = options["debug"]
         self.disable_metadata_checks = options["disable_metadata_checks"]
         self.playlists_mode = options["playlists_mode"]
-        self.collection_type = options["collection_type"]
         self.youtube_id = options["youtube_id"]
 
         self.extra_args = extra_args
@@ -76,10 +75,7 @@ def run(self):
             shutil.rmtree(self.build_dir, ignore_errors=True)  # not needed
             return self.handle_single_zim()
 
-        logger.info(
-            f"starting all-playlits {NAME} scraper "
-            f"for {self.collection_type}#{self.youtube_id}"
-        )
+        logger.info(f"starting all-playlists {NAME} scraper for {self.youtube_id}")
 
         # create required sub folders
         for sub_folder in ("cache", "videos", "channels"):
@@ -96,7 +92,8 @@ def run(self):
             playlists,
             main_channel_id,
             uploads_playlist_id,
-        ) = extract_playlists_details_from(self.collection_type, self.youtube_id)
+            is_playlist,
+        ) = extract_playlists_details_from(self.youtube_id)
 
         logger.info(
             ".. {} playlists:\n   {}".format(
@@ -128,8 +125,6 @@ def run_playlist_zim(self, playlist):
         playlist_id = playlist.playlist_id
         args = [
             *self.youtube2zim_exe,
-            "--type",
-            PLAYLIST,
             "--id",
             playlist_id,
             "--api-key",
@@ -180,8 +175,6 @@ def handle_single_zim(self):
 
         args = [
             *self.youtube2zim_exe,
-            "--type",
-            self.collection_type,
             "--id",
             self.youtube_id,
             "--api-key",
diff --git a/scraper/src/youtube2zim/schemas.py b/scraper/src/youtube2zim/schemas.py
index f7d390a7..4748eb3c 100644
--- a/scraper/src/youtube2zim/schemas.py
+++ b/scraper/src/youtube2zim/schemas.py
@@ -105,7 +105,6 @@ class Channel(CamelModel):
     profile_path: str | None = None
     banner_path: str | None = None
     joined_date: str
-    collection_type: str
     main_playlist: str | None = None
     playlist_count: int
 
diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py
index b1e94b3f..805de89d 100644
--- a/scraper/src/youtube2zim/scraper.py
+++ b/scraper/src/youtube2zim/scraper.py
@@ -43,11 +43,8 @@
 )
 
 from youtube2zim.constants import (
-    CHANNEL,
-    PLAYLIST,
     ROOT_DIR,
     SCRAPER,
-    USER,
     YOUTUBE,
     YOUTUBE_LANG_MAP,
     logger,
@@ -86,13 +83,10 @@
     skip_outofrange_videos,
 )
 
-MAXIMUM_YOUTUBEID_LENGTH = 24
-
 
 class Youtube2Zim:
     def __init__(
         self,
-        collection_type,
         youtube_id,
         api_key,
         video_format,
@@ -124,13 +118,6 @@ def __init__(
         secondary_color=None,
     ):
         # data-retrieval info
-        self.collection_type = collection_type
-        if self.collection_type == USER:
-            logger.warning(
-                "Collection type 'user' is deprecated. Please use 'channel' type,"
-                " behaviors have been merged. 'user' type is going to be dropped in "
-                " next major release"
-            )
         self.youtube_id = youtube_id
         self.api_key = api_key
         self.dateafter = dateafter
@@ -233,23 +220,9 @@ def profile_path(self):
     def banner_path(self):
         return self.build_dir.joinpath("banner.jpg")
 
-    @property
-    def is_user(self):
-        return self.collection_type == USER
-
-    @property
-    def is_channel(self):
-        return self.collection_type == CHANNEL
-
-    @property
-    def is_playlist(self):
-        return self.collection_type == PLAYLIST
-
     @property
     def is_single_channel(self):
-        if self.is_channel or self.is_user:
-            return True
-        return len(list({pl.creator_id for pl in self.playlists})) == 1
+        return len({pl.creator_id for pl in self.playlists}) == 1
 
     @property
     def sorted_playlists(self):
@@ -282,8 +255,6 @@ def run(self):
             # first report => creates a file with appropriate structure
             self.report_progress()
 
-            self.validate_id()
-
             # validate dateafter input
             self.validate_dateafter_input()
 
@@ -303,9 +274,7 @@ def run(self):
             if not self.build_dir.exists() or not self.build_dir.is_dir():
                 raise OSError(f"Incorrect build_dir: {self.build_dir}")
 
-            logger.info(
-                f"starting youtube scraper for {self.collection_type}#{self.youtube_id}"
-            )
+            logger.info(f"starting youtube scraper for {self.youtube_id}")
             logger.info(f"preparing build folder at {self.build_dir.resolve()}")
             self.prepare_build_folder()
 
@@ -497,17 +466,6 @@ def validate_dateafter_input(self):
             )
             raise ValueError(f"Invalid dateafter input: {exc}") from exc
 
-    def validate_id(self):
-        # space not allowed in youtube-ID
-        self.youtube_id = self.youtube_id.replace(" ", "")
-        if (
-            self.collection_type == "channel"
-            and len(self.youtube_id) > MAXIMUM_YOUTUBEID_LENGTH
-        ):
-            raise ValueError("Invalid ChannelId")
-        if "," in self.youtube_id and self.collection_type != "playlist":
-            raise ValueError("Invalid YoutubeId")
-
     def prepare_build_folder(self):
         """prepare build folder before we start downloading data"""
 
@@ -590,7 +548,8 @@ def extract_playlists(self):
             self.playlists,
             self.main_channel_id,
             self.uploads_playlist_id,
-        ) = extract_playlists_details_from(self.collection_type, self.youtube_id)
+            self.is_playlist,
+        ) = extract_playlists_details_from(self.youtube_id)
 
     def extract_videos_list(self):
         all_videos = load_json(self.cache_dir, "videos")
@@ -1262,7 +1221,6 @@ def get_playlist_slug(playlist) -> str:
                 channel_description=channel_data["snippet"]["description"],
                 profile_path="profile.jpg",
                 banner_path="banner.jpg",
-                collection_type=self.collection_type,
                 main_playlist=main_playlist_slug,
                 playlist_count=len(self.playlists),
                 joined_date=channel_data["snippet"]["publishedAt"],
diff --git a/scraper/src/youtube2zim/youtube.py b/scraper/src/youtube2zim/youtube.py
index 8eb17b09..fd565f5b 100644
--- a/scraper/src/youtube2zim/youtube.py
+++ b/scraper/src/youtube2zim/youtube.py
@@ -8,7 +8,7 @@
 from zimscraperlib.download import stream_file
 from zimscraperlib.image.transformation import resize_image
 
-from youtube2zim.constants import CHANNEL, PLAYLIST, USER, YOUTUBE, logger
+from youtube2zim.constants import YOUTUBE, logger
 from youtube2zim.utils import get_slug, load_json, save_json
 
 YOUTUBE_API = "https://www.googleapis.com/youtube/v3"
@@ -23,6 +23,18 @@
 REQUEST_TIMEOUT = 60
 
 
+class ChannelNotFoundError(Exception):
+    """Exception raised when the requested channel is not found"""
+
+    pass
+
+
+class PlaylistNotFoundError(Exception):
+    """Exception raised when the requested playlist is not found"""
+
+    pass
+
+
 class Playlist:
     def __init__(
         self,
@@ -104,8 +116,9 @@ def get_channel_json(channel_id):
                 logger.warning(f"Failed to find {channel_id} by {criteria}")
                 continue
             channel_json = req_json["items"][0]
+            break
         if channel_json is None:
-            raise Exception(f"Impossible to find {channel_id}, check for typos")
+            raise ChannelNotFoundError(f"Invalid channel ID `{channel_id}`: Not Found")
         save_json(YOUTUBE.cache_dir, fname, channel_json)
     return channel_json
 
@@ -164,8 +177,9 @@ def get_playlist_json(playlist_id):
         try:
             playlist_json = req.json()["items"][0]
         except IndexError:
-            logger.error(f"Invalid playlistId `{playlist_id}`: Not Found")
-            raise
+            raise PlaylistNotFoundError(
+                f"Invalid playlistId `{playlist_id}`: Not Found"
+            ) from None
         save_json(YOUTUBE.cache_dir, fname, playlist_json)
     return playlist_json
 
@@ -319,34 +333,43 @@ def skip_outofrange_videos(date_range, item):
     return dt_parser.parse(item["snippet"]["publishedAt"]).date() in date_range
 
 
-def extract_playlists_details_from(collection_type, youtube_id):
-    """prepare a list of Playlist from user request
-
-    USER: we fetch the hidden channel associate to it
-    CHANNEL (and USER): we grab all playlists + `uploads` playlist
-    PLAYLIST: we retrieve from the playlist Id(s)"""
+def extract_playlists_details_from(youtube_id: str):
+    """prepare a list of Playlist from user request"""
 
     uploads_playlist_id = None
     main_channel_id = None
-    if collection_type in (USER, CHANNEL):
-        # get_channel_json is capable to retrieve user and channel
-        channel_json = get_channel_json(youtube_id)
-        main_channel_id = channel_json["id"]
-
-        # retrieve list of playlists for that channel
-        playlist_ids = [p["id"] for p in get_channel_playlists_json(main_channel_id)]
-        # we always include uploads playlist (contains everything)
-        playlist_ids += [channel_json["contentDetails"]["relatedPlaylists"]["uploads"]]
-        uploads_playlist_id = playlist_ids[-1]
-    elif collection_type == PLAYLIST:
+    if "," not in youtube_id:
+        try:
+            # first try to consider passed ID is a channel ID (or username or handle)
+            channel_json = get_channel_json(youtube_id)
+            main_channel_id = channel_json["id"]
+            # retrieve list of playlists for that channel
+            playlist_ids = [
+                p["id"] for p in get_channel_playlists_json(main_channel_id)
+            ]
+            # we always include uploads playlist (contains everything)
+            playlist_ids += [
+                channel_json["contentDetails"]["relatedPlaylists"]["uploads"]
+            ]
+            uploads_playlist_id = playlist_ids[-1]
+            is_playlist = False
+        except ChannelNotFoundError:
+            # channel not found, then ID should be a playlist
+            playlist_ids = [youtube_id]
+            main_channel_id = Playlist.from_id(youtube_id).creator_id
+            is_playlist = True
+    else:
+        # only playlists are supported as a comma-separated list; let's grab all
+        # playlists info (intentionally, to check they are all ok) and use channel of
+        # first playlist as main channel ID
         playlist_ids = youtube_id.split(",")
         main_channel_id = Playlist.from_id(playlist_ids[0]).creator_id
-    else:
-        raise NotImplementedError("unsupported collection_type")
+        is_playlist = True
 
     return (
         # dict.fromkeys maintains the order of playlist_ids while removing duplicates
         [Playlist.from_id(playlist_id) for playlist_id in dict.fromkeys(playlist_ids)],
         main_channel_id,
         uploads_playlist_id,
+        is_playlist,
     )
diff --git a/scraper/tests-integration/integration.py b/scraper/tests-integration/integration.py
index bf6ca6ad..8241a565 100644
--- a/scraper/tests-integration/integration.py
+++ b/scraper/tests-integration/integration.py
@@ -47,7 +47,6 @@ def test_zim_channel_json():
 
     assert channel_json["id"] == "UC8elThf5TGMpQfQc_VE917Q"
     assert channel_json["channelName"] == "openZIM_testing"
-    assert channel_json["collectionType"] == "channel"
     assert channel_json["mainPlaylist"] == "uploads_from_openzim_testing-917Q"
 
 
diff --git a/zimui/cypress/fixtures/channel/channel.json b/zimui/cypress/fixtures/channel/channel.json
index daf09514..19696e8b 100644
--- a/zimui/cypress/fixtures/channel/channel.json
+++ b/zimui/cypress/fixtures/channel/channel.json
@@ -7,6 +7,5 @@
   "profilePath": "profile.jpg",
   "bannerPath": "banner.jpg",
   "joinedDate": "2024-06-04T13:30:16.232286Z",
-  "collectionType": "channel",
   "mainPlaylist": "uploads_from_openzim_testing-917Q"
 }
diff --git a/zimui/src/types/Channel.ts b/zimui/src/types/Channel.ts
index 4cfa8a8d..21039f27 100644
--- a/zimui/src/types/Channel.ts
+++ b/zimui/src/types/Channel.ts
@@ -7,7 +7,6 @@ export interface Channel {
   profilePath?: string
   bannerPath?: string
   joinedDate: string
-  collectionType: string
   mainPlaylist?: string
   playlistCount: number
 }
@@ -25,9 +24,3 @@ export interface Author {
   profilePath?: string
   bannerPath?: string
 }
-
-export enum CollectionType {
-  Playlist = 'playlist',
-  Video = 'video',
-  Channel = 'channel'
-}