Skip to content

Commit

Permalink
Merge pull request #489 from aliparlakci/development
Browse files Browse the repository at this point in the history
Update to v2.3.0
  • Loading branch information
Serene-Arc authored Jul 17, 2021
2 parents 349abbf + 2f8ca76 commit a2f00c7
Show file tree
Hide file tree
Showing 44 changed files with 489 additions and 179 deletions.
9 changes: 9 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[submodule "scripts/tests/bats"]
path = scripts/tests/bats
url = https://github.com/bats-core/bats-core.git
[submodule "scripts/tests/test_helper/bats-assert"]
path = scripts/tests/test_helper/bats-assert
url = https://github.com/bats-core/bats-assert.git
[submodule "scripts/tests/test_helper/bats-support"]
path = scripts/tests/test_helper/bats-support
url = https://github.com/bats-core/bats-support.git
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ The following options are for the `archive` command specifically.
- `json` (default)
- `xml`
- `yaml`
- `--comment-context`
- Instead of downloading an individual comment, this option downloads the submission that the comment is part of
- May result in a longer run time as it retrieves much more data

### Cloner Options

Expand Down
1 change: 1 addition & 0 deletions bdfr/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@

_archiver_options = [
click.option('--all-comments', is_flag=True, default=None),
click.option('--comment-context', is_flag=True, default=None),
click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None),
]

Expand Down
1 change: 1 addition & 0 deletions bdfr/archive_entry/base_archive_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def _convert_comment_to_dict(in_comment: Comment) -> dict:
'id': in_comment.id,
'score': in_comment.score,
'subreddit': in_comment.subreddit.display_name,
'author_flair': in_comment.author_flair_text,
'submission': in_comment.submission.id,
'stickied': in_comment.stickied,
'body': in_comment.body,
Expand Down
3 changes: 3 additions & 0 deletions bdfr/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ def _pull_lever_entry_factory(praw_item: (praw.models.Submission, praw.models.Co
raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}')

def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}')
praw_item = praw_item.submission
archive_entry = self._pull_lever_entry_factory(praw_item)
if self.args.format == 'json':
self._write_entry_json(archive_entry)
Expand Down
3 changes: 2 additions & 1 deletion bdfr/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@ def __init__(self):
self.verbose: int = 0

# Archiver-specific options
self.format = 'json'
self.all_comments = False
self.format = 'json'
self.comment_context: bool = False

def process_click_arguments(self, context: click.Context):
for arg_key in context.params.keys():
Expand Down
16 changes: 8 additions & 8 deletions bdfr/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,11 @@ def _setup_internal_objects(self):
def read_config(self):
"""Read any cfg values that need to be processed"""
if self.args.max_wait_time is None:
if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'):
self.cfg_parser.set('DEFAULT', 'max_wait_time', '120')
logger.log(9, 'Wrote default download wait time download to config file')
self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time')
self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time', fallback=120)
logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds')
if self.args.time_format is None:
option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO')
if re.match(r'^[ \'\"]*$', option):
if re.match(r'^[\s\'\"]*$', option):
option = 'ISO'
logger.debug(f'Setting datetime format string to {option}')
self.args.time_format = option
Expand All @@ -119,7 +116,7 @@ def create_reddit_instance(self):
logger.debug('Using authenticated Reddit instance')
if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
logger.log(9, 'Commencing OAuth2 authentication')
scopes = self.cfg_parser.get('DEFAULT', 'scopes')
scopes = self.cfg_parser.get('DEFAULT', 'scopes', fallback='identity, history, read, save')
scopes = OAuth2Authenticator.split_scopes(scopes)
oauth2_authenticator = OAuth2Authenticator(
scopes,
Expand Down Expand Up @@ -210,7 +207,7 @@ def create_file_logger(self):
if log_path.exists():
try:
file_handler.doRollover()
except PermissionError as e:
except PermissionError:
logger.critical(
'Cannot rollover logfile, make sure this is the only '
'BDFR process or specify alternate logfile location')
Expand Down Expand Up @@ -242,6 +239,9 @@ def get_subreddits(self) -> list[praw.models.ListingGenerator]:
if self.args.subreddit:
out = []
for reddit in self.split_args_input(self.args.subreddit):
if reddit == 'friends' and self.authenticated is False:
logger.error('Cannot read friends subreddit without an authenticated instance')
continue
try:
reddit = self.reddit_instance.subreddit(reddit)
try:
Expand Down Expand Up @@ -394,7 +394,7 @@ def download(self):

@staticmethod
def check_subreddit_status(subreddit: praw.models.Subreddit):
if subreddit.display_name == 'all':
if subreddit.display_name in ('all', 'friends'):
return
try:
assert subreddit.id
Expand Down
8 changes: 6 additions & 2 deletions bdfr/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ def _download_submission(self, submission: praw.models.Submission):
elif not isinstance(submission, praw.models.Submission):
logger.warning(f'{submission.id} is not a submission')
return
elif not self.download_filter.check_url(submission.url):
logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')
return

logger.debug(f'Attempting to download submission {submission.id}')
try:
Expand All @@ -76,7 +79,7 @@ def _download_submission(self, submission: praw.models.Submission):
logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')
continue
elif not self.download_filter.check_resource(res):
logger.debug(f'Download filter removed {submission.id} with URL {submission.url}')
logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
continue
try:
res.download(self.args.max_wait_time)
Expand All @@ -103,7 +106,8 @@ def _download_submission(self, submission: praw.models.Submission):
logger.debug(f'Written file to {destination}')
except OSError as e:
logger.exception(e)
logger.error(f'Failed to write file to {destination} in submission {submission.id}: {e}')
logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
return
creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
os.utime(destination, (creation_time, creation_time))
self.master_hash_list[resource_hash] = destination
Expand Down
3 changes: 3 additions & 0 deletions bdfr/site_downloaders/download_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from bdfr.site_downloaders.gallery import Gallery
from bdfr.site_downloaders.gfycat import Gfycat
from bdfr.site_downloaders.imgur import Imgur
from bdfr.site_downloaders.pornhub import PornHub
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
from bdfr.site_downloaders.youtube import Youtube
Expand Down Expand Up @@ -43,6 +44,8 @@ def pull_lever(url: str) -> Type[BaseDownloader]:
return Youtube
elif re.match(r'i\.redd\.it.*', sanitised_url):
return Direct
elif re.match(r'pornhub\.com.*', sanitised_url):
return PornHub
elif YoutubeDlFallback.can_handle_link(sanitised_url):
return YoutubeDlFallback
else:
Expand Down
40 changes: 24 additions & 16 deletions bdfr/site_downloaders/gallery.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
#!/usr/bin/env python3

import logging
import re
from typing import Optional

import bs4
import requests
from praw.models import Submission

from bdfr.exceptions import SiteDownloaderError
Expand All @@ -20,21 +19,30 @@ def __init__(self, post: Submission):
super().__init__(post)

def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
    """Collect the downloadable Resources for a Reddit gallery submission.

    Reads the gallery item list from the submission itself and, when the
    submission is a crosspost of a gallery, falls back to the crosspost
    parent's gallery data.

    Raises:
        SiteDownloaderError: if no gallery data, or no images, can be found.
    """
    # The dead pre-assignment from the old URL-scraping implementation was
    # removed: it was overwritten unconditionally two lines later.
    try:
        image_urls = self._get_links(self.post.gallery_data['items'])
    except AttributeError:
        # No gallery_data on the submission itself — it may be a crosspost
        # of a gallery, in which case the parent carries the item list.
        try:
            image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items'])
        except (AttributeError, IndexError, TypeError):
            logger.error(f'Could not find gallery data in submission {self.post.id}')
            logger.exception('Gallery image find failure')
            raise SiteDownloaderError('No images found in Reddit gallery')

    if not image_urls:
        raise SiteDownloaderError('No images found in Reddit gallery')
    return [Resource(self.post, url) for url in image_urls]

@staticmethod
def _get_links(url: str) -> list[str]:
resource_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
page = Gallery.retrieve_url(url, headers=resource_headers)
soup = bs4.BeautifulSoup(page.text, 'html.parser')

links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
links = [link.get('href') for link in links]
return links
@staticmethod
def _get_links(id_dict: list[dict]) -> list[str]:
    """Resolve gallery media IDs to direct i.redd.it image URLs.

    Each gallery item carries only a media ID, not a file extension, so the
    known extensions are probed with HEAD requests until one answers 200.

    Args:
        id_dict: the 'items' list from a submission's gallery_data.

    Returns:
        A direct URL for every item whose extension could be resolved;
        items matching no known extension are silently skipped.
    """
    # Loop-invariant tuple hoisted out of the per-item loop; also fixed the
    # nonstandard '@ staticmethod' decorator spacing.
    possible_extensions = ('.jpg', '.png', '.gif', '.gifv', '.jpeg')
    out = []
    for item in id_dict:
        image_id = item['media_id']
        for extension in possible_extensions:
            test_url = f'https://i.redd.it/{image_id}{extension}'
            response = requests.head(test_url)
            if response.status_code == 200:
                out.append(test_url)
                break
    return out
5 changes: 3 additions & 2 deletions bdfr/site_downloaders/imgur.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ def _compute_image_url(self, image: dict) -> Resource:

@staticmethod
def _get_data(link: str) -> dict:
if re.match(r'.*\.gifv$', link):
link = link.rstrip('?')
if re.match(r'(?i).*\.gifv$', link):
link = link.replace('i.imgur', 'imgur')
link = link.rstrip('.gifv')
link = re.sub('(?i)\\.gifv$', '', link)

res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})

Expand Down
26 changes: 26 additions & 0 deletions bdfr/site_downloaders/pornhub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# coding=utf-8

import logging
from typing import Optional

from praw.models import Submission

from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.youtube import Youtube

logger = logging.getLogger(__name__)


class PornHub(Youtube):
    """Downloader for PornHub submissions.

    Delegates all real work to the Youtube downloader, which wraps
    youtube-dl; only the extraction options differ.
    """

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return the single video Resource for this submission."""
        # Ask youtube-dl for the best single-file format and never clobber
        # an already-downloaded file.
        video_options = {
            'format': 'best',
            'nooverwrites': True,
        }
        return [self._download_video(video_options)]
1 change: 0 additions & 1 deletion bdfr/site_downloaders/redgifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import re
from typing import Optional

from bs4 import BeautifulSoup
from praw.models import Submission

from bdfr.exceptions import SiteDownloaderError
Expand Down
1 change: 0 additions & 1 deletion bdfr/site_downloaders/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def _download_video(self, ytdl_options: dict) -> Resource:
except youtube_dl.DownloadError as e:
raise SiteDownloaderError(f'Youtube download failed: {e}')

downloaded_file = None
downloaded_files = list(download_path.iterdir())
if len(downloaded_files) > 0:
downloaded_file = downloaded_files[0]
Expand Down
5 changes: 3 additions & 2 deletions scripts/extract_failed_ids.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ if [ -n "$2" ]; then
output="$2"
echo "Outputting IDs to $output"
else
output="failed.txt"
output="./failed.txt"
fi

# Pull failed-submission IDs out of the logfile, one extractor per known
# error-message format, and append them to the output file.
{
    # 'Could not download submission <id>:' — ID is field 12 with a
    # trailing colon, stripped via rev|cut|rev.
    grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ;
    # 'Failed to download resource ... in submission <id> with downloader ...'
    grep 'Failed to download resource' "$file" | awk '{ print $15 }' ;
    # 'Site <name> failed to download submission <id>:' — strip the colon.
    grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ;
    # 'Failed to write file in submission <id> to <path>: ...' — the ID is
    # field 14 (7 log-prefix fields + 'Failed to write file in submission')
    # and carries no trailing punctuation, so no character stripping.
    # Fixed: previous code duplicated this extractor and printed field 13,
    # which is the literal word 'submission'.
    grep 'Failed to write file' "$file" | awk '{ print $14 }' ;
    # 'Submission <id> skipped due to disabled module <name>'
    grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ;
} >>"$output"
2 changes: 1 addition & 1 deletion scripts/extract_successful_ids.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [ -n "$2" ]; then
output="$2"
echo "Outputting IDs to $output"
else
output="successful.txt"
output="./successful.txt"
fi

{
Expand Down
13 changes: 13 additions & 0 deletions scripts/tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Bash Scripts Testing

The `bats` framework is included and used to test the bundled scripts, specifically those designed to parse the logging output. As this involves delicate regexes and field indexes, it is necessary to test them.

## Running Tests

Running the tests is easy and can be done with a single command. With this directory as the working directory, run the following command.

```bash
./bats/bin/bats *.bats
```

This will run all test files that have the `.bats` suffix.
1 change: 1 addition & 0 deletions scripts/tests/bats
Submodule bats added at ce5ca2
1 change: 1 addition & 0 deletions scripts/tests/example_logfiles/failed_disabled_module.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[2021-06-12 12:49:18,452 - bdfr.downloader - DEBUG] - Submission m2601g skipped due to disabled module Direct
3 changes: 3 additions & 0 deletions scripts/tests/example_logfiles/failed_no_downloader.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[2021-06-12 11:13:35,665 - bdfr.downloader - ERROR] - Could not download submission nxv3ew: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.447961v1?rss=1
[2021-06-12 11:14:21,958 - bdfr.downloader - ERROR] - Could not download submission nxv3ek: No downloader module exists for url https://alkossegyedit.hu/termek/pluss-macko-poloval-20cm/?feed_id=34832&_unique_id=60c40a1190ccb&utm_source=Reddit&utm_medium=AEAdmin&utm_campaign=Poster
[2021-06-12 11:17:53,456 - bdfr.downloader - ERROR] - Could not download submission nxv3ea: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.448067v1?rss=1
2 changes: 2 additions & 0 deletions scripts/tests/example_logfiles/failed_resource_error.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[2021-06-12 11:18:25,794 - bdfr.downloader - ERROR] - Failed to download resource https://i.redd.it/61fniokpjq471.jpg in submission nxv3dt with downloader Direct: Unrecoverable error requesting resource: HTTP Code 404

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[2021-06-12 08:38:35,657 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxr7x9: No images found in Reddit gallery
[2021-06-12 08:47:22,005 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxpn0h: Server responded with 503 to https://www.reddit.com/gallery/nxpkvh
1 change: 1 addition & 0 deletions scripts/tests/example_logfiles/failed_write_error.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[2021-06-09 22:01:04,530 - bdfr.downloader - ERROR] - Failed to write file in submission nnboza to C:\Users\Yoga 14\path\to\output\ThotNetwork\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4: [Errno 2] No such file or directory: 'C:\\Users\\Yoga 14\\path\\to\\output\\ThotNetwork\\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4'
3 changes: 3 additions & 0 deletions scripts/tests/example_logfiles/succeed_already_exists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[2021-06-12 08:41:51,464 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxry0l.jpg from submission nxry0l already exists, continuing
[2021-06-12 08:41:51,469 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrlgn.gif from submission nxrlgn already exists, continuing
[2021-06-12 08:41:51,472 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrq9g.png from submission nxrq9g already exists, continuing
3 changes: 3 additions & 0 deletions scripts/tests/example_logfiles/succeed_download_filter.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[2021-06-10 20:36:48,722 - bdfr.downloader - DEBUG] - Download filter removed nwfirr with URL https://www.youtube.com/watch?v=NVSiX0Tsees
[2021-06-12 19:56:36,848 - bdfr.downloader - DEBUG] - Download filter removed nwfgcl with URL https://www.reddit.com/r/MaliciousCompliance/comments/nwfgcl/new_guy_decided_to_play_manager_alright/
[2021-06-12 19:56:28,587 - bdfr.downloader - DEBUG] - Download filter removed nxuxjy with URL https://www.reddit.com/r/MaliciousCompliance/comments/nxuxjy/you_want_an_omelette_with_nothing_inside_okay/
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[2021-06-12 11:58:53,864 - bdfr.downloader - INFO] - Downloaded submission nxui9y from tumblr
[2021-06-12 11:58:56,618 - bdfr.downloader - INFO] - Downloaded submission nxsr4r from tumblr
[2021-06-12 11:58:59,026 - bdfr.downloader - INFO] - Downloaded submission nxviir from tumblr
[2021-06-12 11:59:00,289 - bdfr.downloader - INFO] - Downloaded submission nxusva from tumblr
[2021-06-12 11:59:00,735 - bdfr.downloader - INFO] - Downloaded submission nxvko7 from tumblr
[2021-06-12 11:59:01,215 - bdfr.downloader - INFO] - Downloaded submission nxvd63 from tumblr
[2021-06-12 11:59:13,891 - bdfr.downloader - INFO] - Downloaded submission nn9cor from tumblr
1 change: 1 addition & 0 deletions scripts/tests/example_logfiles/succeed_hard_link.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Hard link made linking /media/smaug/private/reddit/tumblr/nwnp2n.jpg to /media/smaug/private/reddit/tumblr/nwskqb.jpg in submission nwnp2n
1 change: 1 addition & 0 deletions scripts/tests/example_logfiles/succeed_resource_hash.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Resource hash aaaaaaaaaaaaaaaaaaaaaaa from submission n86jk8 downloaded elsewhere
43 changes: 43 additions & 0 deletions scripts/tests/test_extract_failed_ids.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Bats test suite for ../extract_failed_ids.sh.
# Each case runs the extractor against a canned logfile from
# ./example_logfiles and checks the IDs written to failed.txt.
setup() {
# Load the bats-support / bats-assert helper libraries (git submodules).
load ./test_helper/bats-support/load
load ./test_helper/bats-assert/load
}

teardown() {
# Delete the output file so one case cannot leak results into the next.
rm -f failed.txt
}

@test "fail run no logfile" {
# Without a logfile argument the script must exit non-zero.
run ../extract_failed_ids.sh
assert_failure
}

@test "fail no downloader module" {
run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt
# Fixture holds three 'Could not download submission' lines -> three IDs.
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ];
# Every extracted line must look like a submission ID (6-7 word chars).
# NOTE(review): the pattern is unanchored, so any line merely *containing*
# six word characters passes — consider '^\w{6,7}$' for a stricter check.
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail resource error" {
# Fixture holds one 'Failed to download resource' line.
run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail site downloader error" {
# Fixture holds two 'failed to download submission' lines.
run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail failed file write" {
# Fixture holds one 'Failed to write file' line.
run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail disabled module" {
# Fixture holds one 'skipped due to disabled module' line.
run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}
Loading

0 comments on commit a2f00c7

Please sign in to comment.