Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add web archive as a final fallback [WIP] #40

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub struct Config
pub NHENTAI_TAGS: Option<Vec<String>>, // keep creating downloadme.txt from these tags and keep downloading (server mode), normal tags are in format "tag:{tag}" for example "tag:ffm-threesome"; if None: don't generate downloadme.txt, download hentai once (client mode)
pub SLEEP_INTERVAL: Option<u64>, // sleep interval in seconds between checking for new hentai to download (server mode)
pub USER_AGENT: Option<String>, // bypass bot protection
    pub ARCHIVE_ORG: Option<bool>, // allow pulling from archive.org as a fallback? treated as false if None
shinji257 marked this conversation as resolved.
Show resolved Hide resolved
}

impl Default for Config
Expand All @@ -39,6 +40,7 @@ impl Default for Config
NHENTAI_TAGS: None,
SLEEP_INTERVAL: Some(50000),
USER_AGENT: Some("".to_owned()),
ARCHIVE_ORG: None,
}
}
}
105 changes: 104 additions & 1 deletion src/hentai.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,11 @@ impl Hentai
/// # Arguments
/// - `http_client`: reqwest http client
    /// - `db`: database connection
/// - `webarchive`: Download from web archive? False by default.
///
/// # Returns
/// - nothing or error
pub async fn download(&self, http_client: &reqwest::Client, cleanup_temporary_files: bool) -> Result<(), HentaiDownloadError>
pub async fn download(&self, http_client: &reqwest::Client, cleanup_temporary_files: bool, webarchive: bool) -> Result<(), HentaiDownloadError>
{
const WORKERS: usize = 5; // number of parallel workers
let cbz_final_filepath: String; //filepath to final cbz in library
Expand Down Expand Up @@ -218,6 +219,44 @@ impl Hentai
}
if image_download_success {break;} // if all images were downloaded successfully: continue with cbz creation
}
if !image_download_success && webarchive == true { // Web Archive Loop
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want to download images from web archive, only the metadata as the images should not have been purged from the nhentai media servers (see famously 177013, last section in my readme). Correct me if I'm wrong.

Then hypothetically speaking, if we wanted to download images from the web archive: Completely duplicating the outer download logic is likely unnecessary. If I see this correctly, the only thing that's different is that you call a different download image function Self::archive_image. Why didn't you just implement fallback to web archive logic in the original download_image function?

image_download_success = true; // assume success
handles = Vec::new(); // reset handles

for i in 0..self.images_url.len() // for each page
{
let f_clone: scaler::Formatter = f.clone();
let http_client_clone: reqwest::Client = http_client.clone();
let image_filepath: String = format!("{}{}/{}", self.library_path, self.id, self.images_filename.get(i).expect("Index out of bounds even though should have same size as images_url."));
let image_url_clone: String = self.images_url.get(i).expect("Index out of bounds even though checked before that it fits.").clone();
let num_pages_clone: u16 = self.num_pages;

let permit: tokio::sync::OwnedSemaphorePermit = worker_sem.clone().acquire_owned().await.expect("Something closed semaphore even though it should never be closed."); // acquire semaphore
handles.push(tokio::spawn(async move
{
let result: Option<()>;
match Self::archive_image(&http_client_clone, &image_url_clone, &image_filepath).await // download image
{
Ok(_) =>
{
log::debug!("Downloaded hentai image {} / {}.", f_clone.format((i+1) as f64), f_clone.format(num_pages_clone as f64));
result = Some(()); // success
}
Err(e) =>
{
log::warn!("{e}");
result = None; // failure
}
}
drop(permit); // release semaphore
result // return result into handle
})); // search all pages in parallel
}
for handle in handles
{
    if handle.await.unwrap().is_none() {image_download_success = false;} // collect results, forward panics; record failure so we give up afterwards instead of needlessly spamming IA on a set that won't download
}
}
if !image_download_success {return Err(HentaiDownloadError::Download {})}; // if after 5 attempts still not all images downloaded successfully: give up
log::info!("Downloaded hentai images.");

Expand Down Expand Up @@ -331,6 +370,7 @@ impl Hentai


    let mut r: reqwest::Response = http_client.get(image_url).send().await?; // download image from general media server

if r.status() != reqwest::StatusCode::OK // if status not ok: retry with other media servers
{
for media_server in MEDIA_SERVERS // try all media servers
Expand All @@ -341,6 +381,69 @@ impl Hentai
if r.status() == reqwest::StatusCode::OK {break;} // if not ok: try again
}
}

if r.status() != reqwest::StatusCode::OK {return Err(HentaiDownloadImageError::ReqwestStatus {url: image_url.to_owned(), status: r.status()});} // if status still not ok: something went wrong

let mut file: tokio::fs::File;
#[cfg(target_family = "unix")]
{
if let Some(parent) = std::path::Path::new(image_filepath).parent() // create all parent directories with permissions "drwxrwxrwx"
{
if let Err(e) = tokio::fs::DirBuilder::new().recursive(true).mode(0o777).create(parent).await
{
return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});
}
}
match tokio::fs::OpenOptions::new().create_new(true).mode(0o666).write(true).open(image_filepath).await
{
Ok(o) => file = o,
Err(e) => {return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});}
}
}
#[cfg(not(target_family = "unix"))]
{
if let Some(parent) = std::path::Path::new(image_filepath).parent() // create all parent directories
{
if let Err(e) = tokio::fs::DirBuilder::new().recursive(true).create(parent).await
{
return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});
}
}
match tokio::fs::OpenOptions::new().create_new(true).write(true).open(image_filepath).await
{
Ok(o) => file = o,
Err(e) => {return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});}
}
}

if let Err(e) = file.write_all_buf(&mut r.bytes().await?).await // save image with permissions "rw-rw-rw-"
{
return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});
}

return Ok(());
}

async fn archive_image(http_client: &reqwest::Client, image_url: &str, image_filepath: &str) -> Result<(), HentaiDownloadImageError>
{

if let Ok(o) = tokio::fs::metadata(image_filepath).await
{
if o.is_file() {return Ok(());} // if image already exists: skip download
if o.is_dir() {return Err(HentaiDownloadImageError::BlockedByDirectory {directory_path: image_filepath.to_owned()});} // if image filepath blocked by directory: give up
}


    let mut r: reqwest::Response = http_client.get(image_url).send().await?; // first try the general media server directly

if r.status() != reqwest::StatusCode::OK // if status not ok: retry with other media servers
{
log::warn!("Pulling from the Internet Archive: {image_url}");
log::debug!("{}", image_url.replace("https://i.nhentai.net", format!("https://web.archive.org/web/00000000000000if_/https://i.nhentai.net").as_str()));
    r = http_client.get(image_url.replace("https://i.nhentai.net", format!("https://web.archive.org/web/00000000000000if_/https://i.nhentai.net").as_str())).send().await?; // retry via the Wayback Machine raw-content ("if_") endpoint
log::debug!("{}", r.status());
}

if r.status() != reqwest::StatusCode::OK {return Err(HentaiDownloadImageError::ReqwestStatus {url: image_url.to_owned(), status: r.status()});} // if status still not ok: something went wrong

let mut file: tokio::fs::File;
Expand Down
2 changes: 1 addition & 1 deletion src/main_inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ pub async fn main_inner(config: Config) -> Result<(), Error>
}
}

if let Err(e) = hentai.download(&http_client, config.CLEANUP_TEMPORARY_FILES.unwrap_or(true)).await
if let Err(e) = hentai.download(&http_client, config.CLEANUP_TEMPORARY_FILES.unwrap_or(true), config.ARCHIVE_ORG.unwrap_or(false)).await
{
log::error!{"{e}"};
}
Expand Down