Skip to content

Commit

Permalink
Merge pull request #363 from openzim/filter_private_vids
Browse files (browse the repository at this point in the history)
Filter-out non-public videos and properly cleanup unsuccessful videos
  • Loading branch information
benoit74 authored Oct 17, 2024
2 parents 46c1d3c + 326187d commit f20ac6d
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Raise exception if there are no videos in the playlists (#347)

### Fixed

- Filter out non-public videos and properly clean up unsuccessful videos (#362)

## [3.2.0] - 2024-10-11

### Deprecated
Expand Down
9 changes: 5 additions & 4 deletions scraper/src/youtube2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
get_videos_json,
save_channel_branding,
skip_deleted_videos,
skip_non_public_videos,
skip_outofrange_videos,
)

Expand Down Expand Up @@ -611,6 +612,7 @@ def extract_videos_list(self):
)
filter_videos = filter(skip_outofrange, videos_json)
filter_videos = filter(skip_deleted_videos, filter_videos)
filter_videos = filter(skip_non_public_videos, filter_videos)
all_videos.update(
{v["contentDetails"]["videoId"]: v for v in filter_videos}
)
Expand Down Expand Up @@ -1038,10 +1040,9 @@ def update_metadata(self):
def make_json_files(self, actual_videos_ids):
"""Generate JSON files to be consumed by the frontend"""

def remove_unused_videos(videos):
video_ids = [video["contentDetails"]["videoId"] for video in videos]
def remove_unused_videos():
for path in self.videos_dir.iterdir():
if path.is_dir() and path.name not in video_ids:
if path.is_dir() and path.name not in actual_videos_ids:
logger.debug(f"Removing unused video {path.name}")
shutil.rmtree(path, ignore_errors=True)

Expand Down Expand Up @@ -1282,7 +1283,7 @@ def get_playlist_slug(playlist) -> str:
)

# clean videos left out in videos directory
remove_unused_videos(videos)
remove_unused_videos()

def add_file_to_zim(
self,
Expand Down
7 changes: 6 additions & 1 deletion scraper/src/youtube2zim/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def get_videos_json(playlist_id):
PLAYLIST_ITEMS_API,
params={
"playlistId": playlist_id,
"part": "snippet,contentDetails",
"part": "snippet,contentDetails,status",
"key": YOUTUBE.api_key,
"maxResults": RESULTS_PER_PAGE,
"pageToken": page_token,
Expand Down Expand Up @@ -309,6 +309,11 @@ def skip_deleted_videos(item):
)


def skip_non_public_videos(item):
    """filter func keeping only videos whose privacy status is public

    Returns True when the item's `status.privacyStatus` value is exactly
    "public" and False for any other value, so the function can be handed
    directly to `filter()`.
    """
    privacy_status = item["status"]["privacyStatus"]
    return privacy_status == "public"


def skip_outofrange_videos(date_range, item):
    """filter func keeping only videos published within `date_range`

    Parses the item's `snippet.publishedAt` value, reduces it to its date
    part and returns whether that date is contained in `date_range`.
    """
    published_on = dt_parser.parse(item["snippet"]["publishedAt"]).date()
    return published_on in date_range
Expand Down

0 comments on commit f20ac6d

Please sign in to comment.