From 9c3c601df1e7a589df8305583197ac8ba85e2b35 Mon Sep 17 00:00:00 2001
From: Nocturnal Nebula
Date: Thu, 28 Dec 2023 13:09:42 -0300
Subject: [PATCH] Add files via upload

---
 litstash.py        | 307 ++++++++++++++++++++++++++++++++++-----------
 readMe.txt         |  21 ++--
 versionChanges.txt |   9 ++
 3 files changed, 255 insertions(+), 82 deletions(-)

diff --git a/litstash.py b/litstash.py
index 16cc5c1..d822c31 100644
--- a/litstash.py
+++ b/litstash.py
@@ -2,7 +2,7 @@

 # LICENSE INFORMATION: GNU GPLv3.0-or-later

-# litstash is a program for downloading submissions from Literotica
+# litstash is a downloader for submissions from Literotica and xnxx stories

 # Copyright (C) 2023 NocturnalNebula
@@ -35,10 +35,11 @@
 # initialize global variables
 origCwd = os.getcwd()
 downloadList = []
-version = 'Version: 1.3'
-updated = 'Updated: September 2023'
+version = 'Version: 1.4'
+updated = 'Updated: December 2023'
 usage = '''
-litstash can download any submission from Literotica or from Wayback Machine captures
+litstash is a story downloader with support for the sites Literotica and xnxx,
+including Wayback Machine captures of either site

 usage:
   litstash [option]
@@ -52,13 +53,15 @@
 options:
   -h, --help    print usage guide
   --version     print version number
+  -a            automatically download all detected submissions

 URL:
   - URLs should be surrounded by "quotes" to prevent an '&' from confusing the shell
-  - URLs are any Literotica submission (story, poem, audio, illustration) or
-    any page that contains links to Literotica submissions (author submission
-    pages, favorite lists, etc...)
-  - literotica submissions can be from literotica.com or the Wayback Machine
+  - URLs are any of the following:
+      - Literotica submission (story, poem, audio, illustration)
+      - xnxx story (sexstories.com)
+      - any page containing links to either of the above (e.g. author pages)
+      - Wayback Machine capture of any of the above
   - multiple URLs can be included

 examples: (replace 'litstash' with 'python litstash.py' if running with Python)
   litstash https://www.literotica.com/s/an-erotic-story-9 https://www.literotica.com/p/a-smutty-poem-8
   litstash "https://www.literotica.com/stories/memberpage.php?uid=0000000&page=submissions"
   litstash "https://web.archive.org/web/20130919123456/https://www.literotica.com/s/a-deleted-story-4"
-  litstash "https://web.archive.org/web/20130723123456/https://www.literotica.com/stories/memberpage.php?uid=0000000&page=submissions"
+  litstash -a "https://web.archive.org/web/20130723123456/https://www.literotica.com/stories/memberpage.php?uid=0000000&page=submissions"

 more:
-  - downloads will be saved in 'lit-submissions' created in the current working directory
+  - downloads will be saved in 'litstash-saves' created in the current working directory
 '''
-
 # CLASSES

 class literotica:

@@ -81,6 +83,7 @@ def __init__(self, url):
         # immediately defined
         self.url = url
         self.slug = self.url.split('/')[-1]
+        self.publisher = "Literotica"

         self.apiData = {} # dict from rest api
         self.pageSource = ''
@@ -88,6 +91,7 @@ def __init__(self, url):
         # submission metadata
         self.title = ''
         self.username = ''
+        self.series = ''
         self.date = ''
         self.description = ''
         self.category = ''
@@ -128,7 +132,8 @@ def getApiData(self):
     def download(self):
         # collect all data needed to export the submission

-        print(f"\nDownloading {cleanTitle(self.url)}")
+        print(f"\n[Downloading] {cleanTitle(self.url)} ({getSite(self.url)})")
+        print(f"[URL] {self.url}")

         # loop through each page of the story, scraping the text of each page
         while self.currentPage <= self.pages:
@@ -154,6 +159,12 @@ def download(self):
             self.description = self.apiData['submission']['description']
             self.category = getCategory(self.apiData['submission']['category_info']['pageUrl'])

+            # get series data only if it exists
+            try:
+                self.series = self.apiData['submission']['series']['meta']['title']
+            except TypeError:
+                pass
+
             # need to get real page source for illustration categories
             if self.category == 'Adult Comics' or self.category == 'Erotic Art':
                 # get page source to gather pageText, send it into getImages
@@ -175,7 +186,7 @@ def download(self):

             # append current page text to master text string
             self.text += f"{pageText}"
-            print(f"[Saved] Page {self.currentPage} of {self.pages}")
+            print(f"[Saved] Page {self.currentPage} of {self.pages}")
             self.currentPage += 1

         self.wordCount = wordCount(self.text)
@@ -193,13 +204,15 @@ def build(self):
     def export(self):
         export(self.path, self.filename, self.output)

-class waybackMachine:
+class waybackMachineLit:

     def __init__(self, url):
         # immediately defined
         self.url = url
         self.slug = self.url.split('/')[-1]
         self.authorPageUrl = ''
+        self.publisher = "Literotica"
+
         self.pageSource = ''
@@ -207,6 +220,7 @@ def __init__(self, url):
         self.scheme = '' # Modern / Classic / Pre-Classic / Original
         self.title = ''
         self.username = ''
+        self.series = ''
         self.date = ''
         self.description = ''
         self.category = ''
@@ -231,6 +245,7 @@ def getDateFromAuthor(self):
         if self.authorPageUrl == '':
             print('Cannot find author page URL.')
             print('Try a different Wayback Machine capture to include the upload date.')
+            self.date = '0000-00-00'
             return

         if self.authorPageUrl.startswith('//'):
@@ -242,6 +257,7 @@ def getDateFromAuthor(self):
         if 'This member does not exists' in authorPageSource:
             print('No capture available of author page.')
             print('Try a different Wayback Machine capture to include the upload date.')
+            self.date = '0000-00-00'
             return
         else:
             # put the index before the relevant entry
@@ -249,6 +265,7 @@ def getDateFromAuthor(self):
         if i == -1:
             print('Cannot find submission upload date on author page.')
             print('Try a different Wayback Machine capture to include the upload date.')
+            self.date = '0000-00-00'
             return

         # scan forward from submission url to a substring matching the date format (>**/**/**) *=digit
@@ -350,7 +367,7 @@ def downloadModern(self):
         self.wordCount = sandwichMaker(self.pageSource, '"words_count":', ',')
         self.category = getCategory(sandwichMaker(self.pageSource, '/https://www.literotica.com/c/', '"'))

-        print('[Retrieved] Submission Metadata')
+        print('[Retrieved] Submission Metadata')

         # extract and clean up pageText from pageSource
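A note on sandwichMaker, which the hunks above and below call constantly but this diff never defines: judging only from its call sites (a source string, an opening marker, a closing marker, an optional start offset, and -1 when a marker is missing), it plausibly behaves like the sketch below; the project's real helper and its parameter names may differ.

    def sandwichMaker(source, topBread, bottomBread, start=0):
        # return the text of 'source' sandwiched between the first
        # 'topBread' at or after 'start' and the next 'bottomBread';
        # -1 signals a missing marker, matching call-site checks such
        # as 'if self.username == -1'
        begin = source.find(topBread, start)
        if begin == -1:
            return -1
        begin += len(topBread)
        end = source.find(bottomBread, begin)
        if end == -1:
            return -1
        return source[begin:end]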
@@ -376,7 +393,7 @@ def downloadClassic(self):
         self.authorPageUrl = sandwichMaker(self.pageSource,'','')
@@ -395,7 +412,7 @@ def downloadPreClassic(self):
         self.authorPageUrl = sandwichMaker(self.pageSource,'class="b-story-user">','')
         self.getDateFromAuthor()
-        print('[Retrieved] Submission Metadata')
+        print('[Retrieved] Submission Metadata')

         self.pageText = sandwichMaker(self.pageSource,'class="b-story-body">','')
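The date scan described in getDateFromAuthor above ("scan forward from submission url to a substring matching the date format (>**/**/**)") is the step that feeds the new '0000-00-00' fallbacks. A hedged regex equivalent of that scan, with a hypothetical helper name and assuming two-digit MM/DD/YY fields and 20xx upload years:

    import re

    def scanForUploadDate(authorPageSource, i):
        # find the first '>MM/DD/YY' (e.g. '>09/19/13') after index i,
        # where i points at the submission's entry on the author page
        match = re.search(r'>(\d{2})/(\d{2})/(\d{2})', authorPageSource[i:])
        if match is None:
            return '0000-00-00'          # fallback added by this patch
        month, day, year = match.groups()
        return f'20{year}-{month}-{day}' # assumes an upload year of 20xx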

@@ -416,7 +433,7 @@ def downloadPostOriginal(self):
         self.authorPageUrl = sandwichMaker(self.pageSource,'by ','') + ''
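getSource, used for every page fetch here and in the xnxx class below, is another helper the diff does not show. Its call sites expect either page HTML or the sentinel string 'skip' (see the xnxx download method below), so a minimal sketch could look like this; the real function's headers, retries, and error handling are unknown:

    import urllib.request
    import urllib.error

    def getSource(url):
        # fetch a page and return its decoded HTML, or 'skip' so the
        # caller can flag the submission and move on
        request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8', errors='replace')
        except urllib.error.URLError as error:
            print(f'Could not fetch {url} ({error}); skipping.')
            return 'skip'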

@@ -437,7 +454,7 @@ def downloadOriginal(self):
         self.authorPageUrl = sandwichMaker(self.pageSource,'by ','') + ''
@@ -445,8 +462,8 @@ def download(self):
     def download(self):
         # collect all data needed to export the submission, calling the appropriate download functions for each scheme

-        print(f"\nDownloading {cleanTitle(self.url)}")
-        print(self.url)
+        print(f"\n[Downloading] {cleanTitle(self.url)} ({getSite(self.url)})")
+        print(f"[URL] {self.url}")

         # loop through each page of submission and gather all page text
         while self.currentPage <= self.pages:
@@ -476,7 +493,7 @@ def download(self):

             self.text += self.pageText

-            print(f"[Saved] Page {self.currentPage} of {self.pages} ({self.scheme} Site Scheme)")
+            print(f"[Saved] Page {self.currentPage} of {self.pages} ({self.scheme} Site Scheme)")

             self.currentPage += 1
@@ -495,11 +512,86 @@ def build(self):
     def export(self):
         export(self.path, self.filename, self.output)

+class xnxx:
+
+    def __init__(self, url):
+        # immediately defined
+        self.url = url
+        self.publisher = "xnxx"
+
+        self.pageSource = ''
+
+        # submission metadata
+        self.title = ''
+        self.username = ''
+        self.series = ''
+        self.date = ''
+        self.description = ''
+        self.category = ''
+
+        # submission text attributes
+        self.text = ''
+        self.wordCount = ''
+
+        self.skip = 0 # skipping flag if there is a problem getting page source
+
+        # build items
+        self.output = ''
+        self.path = ''
+        self.filename = ''
+
+    # PRIMARY METHODS
+    def download(self):
+        # collect all data needed to export the submission
+
+        print(f"\n[Downloading] {cleanTitle(self.url)} ({getSite(self.url)})")
+        print(f"[URL] {self.url}")
+
+        self.pageSource = getSource(self.url)
+        if self.pageSource == 'skip': self.skip = 1; return

+        title = sandwichMaker(self.pageSource,'','')
+        self.username = sandwichMaker(self.pageSource,'\n','', self.pageSource.find('/profile'))
+        if self.username == -1: self.username = 'unknown_author'
+
+        description = sandwichMaker(self.pageSource,'\n','',self.pageSource.find('Introduction:'))
+        if description == -1: self.description = ''
+        else:
+            self.description = (' ').join(list(filter(None, description.replace('\t','').split(' '))))
+        category = sandwichMaker(self.pageSource,'\n','').replace('\t','')
+        self.category = (' ').join(list(filter(None, category.split(' '))))
+
+        date = sandwichMaker(self.pageSource,'Posted ','',self.pageSource.find('
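The join/filter/split expression repeated in the xnxx hunk above is a whitespace normalizer: drop tabs, split on spaces, discard the empty strings that runs of spaces produce, and rejoin with single spaces. A standalone demonstration (the sample text is made up):

    raw = '\tIncest\t\t  Teen   Romance '  # hypothetical scraped category text
    clean = (' ').join(list(filter(None, raw.replace('\t','').split(' '))))
    print(clean)  # -> 'Incest Teen Romance'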