+ date = sandwichMaker(self.pageSource, ..., ...)  # HTML marker arguments elided; sandwichMaker returns -1 on failure
+ if date == -1: self.date = '0000-00-00'
+ else:
+ date = list(filter(None, date.replace('\t','').split(' ')))
+ if 'ago' in date: self.date = '0000-00-00'
+ else:
+ self.date = ('-').join([date[-1],getMonth(date[-2]),'0' + date[1][:-2] if int(date[1][:-2])<10 else date[1][:-2]])
+
+ self.text = sandwichMaker(self.pageSource, ..., ..., self.pageSource.find(...))  # HTML tag arguments elided
+
+ self.wordCount = wordCount(self.text)
+
+ def build(self):
+ # prepare all retrieved submission data for export
+
+ self.output = getOutput(self)
+ self.path = getPath(self.username)
+ self.filename = getFilename(self.date,self.title)
+
+ def export(self):
+ export(self.path, self.filename, self.output)
# HELPER FUNCTIONS
def sandwichMaker(textSource, topBread, bottomBread, start=0, reverse=0):
# return part of a string between two known substrings (the 'filling' of a sandwich)
+ # returns -1 if the substring cannot be found
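+ # e.g. sandwichMaker('<b>Hi</b>', '<b>', '</b>') -> 'Hi' (illustrative call based on the contract above)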
if reverse == 1:
begin = textSource.rfind(topBread) + len(topBread)
@@ -512,34 +604,35 @@ def sandwichMaker(textSource, topBread, bottomBread, start=0, reverse=0):
def cleanTitle(url):
# convert the submission url into a temp title to display while downloading
-
+
+ # xnxx url
+ if 'sexstories.com' in url:
+ if url.endswith('/'): url = url[:-1]
+ return url.split('/')[-1].replace('_', ' ').title()
+
+ # Original Literotica url (no title in url, use number)
if 'literotica.com/stories/showstory.php' in url:
- cleanTitle = f"Title Unknown ({url.split('?')[-1]})"
- else:
+ return f"Title Unknown ({url.split('?')[-1]})"
+
+ # all other Literotica schemes
+ if 'literotica.com' in url:
cleanTitle = url.split('/')[-1].split('-')
# remove unimportant numbers from the end of the title
if len(cleanTitle) > 1:
if cleanTitle[-1].isdigit() and cleanTitle[-2].isdigit():
del cleanTitle[-1]
-
if 'ch' not in cleanTitle and 'pt' not in cleanTitle:
if cleanTitle[-1].isdigit(): del cleanTitle[-1]
- cleanTitle = (' ').join(cleanTitle).title()
-
- site = getSite(url)
- if site == 'Wayback Machine':
- return f"[{getKind(url)}] {cleanTitle} ({site})"
- else:
- return f"[{getKind(url)}] {cleanTitle}"
+ # join the list back into a single title string
+ return (' ').join(cleanTitle).title()
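+
+ # illustrative traces (hypothetical URLs), following the branches above:
+ # cleanTitle('https://www.sexstories.com/story/12345/my_first_story/') -> 'My First Story'
+ # cleanTitle('https://www.literotica.com/stories/showstory.php?id=777') -> 'Title Unknown (id=777)'
+ # cleanTitle('https://www.literotica.com/s/an-example-story-ch-02') -> 'An Example Story Ch 02'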
def getSource(url, attempts=0):
# get the webpage html source from a given url and sort all errors
-
- # certain errors require a retry, limit them to 5
- if attempts == 5:
+ # certain errors require a retry, limit them to 7
+ if attempts == 7:
print('Too many attempts. Try again later.')
return 'skip'
@@ -618,13 +711,18 @@ def getSource(url, attempts=0):
def getPath(username):
# return save path for final export (no spaces in filenames)
- return os.path.join(origCwd, 'lit-submissions', username.replace(' ','_'))
+ return os.path.join(origCwd, 'litstash-saves', username.replace(' ','_'))
def getSite(url):
- # check if the submission is on literotica.com or the wayback machine
+ # check if the submission is from xnxx, literotica, wayback machine
- if 'web.archive.org' in url: return 'Wayback Machine'
- else: return 'Literotica'
+ if 'web.archive.org' in url:
+ if 'literotica.com' in url: return 'Wayback Machine/Literotica'
+ if 'sexstories.com' in url: return 'Wayback Machine/xnxx'
+ else:
+ if 'literotica.com' in url: return 'Literotica'
+ if 'sexstories.com' in url: return 'xnxx'
+ return 'unknown'
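+
+ # illustrative calls (hypothetical URLs):
+ # getSite('https://www.literotica.com/s/an-example-story') -> 'Literotica'
+ # getSite('https://web.archive.org/web/2020/https://www.sexstories.com/story/1/x') -> 'Wayback Machine/xnxx'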
def cleanIllustrationSource(obj):
# remove unimportant html tags from page source of illustrations
@@ -711,11 +809,20 @@ def getCategory(category):
def getKind(url):
# check if submission is a story, poem, or illustration
- if '/s/' in url or '/stories/showstory.php' in url: return 'Story'
+ if '/s/' in url or '/stories/showstory.php' in url or 'sexstories.com' in url:
+ return 'Story'
elif '/i/' in url: return 'Illustration'
elif '/p/' in url: return 'Poem'
else: return 'unknown'
+def getMonth(month):
+ month_dict = {
+ 'January' : '01','February' : '02','March' : '03','April' : '04','May' : '05','June' : '06',
+ 'July' : '07','August' : '08','September' : '09','October' : '10','November' : '11','December' : '12'
+ }
+
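+ # e.g. getMonth('July') -> '07'; assumes full English month names, so a
+ # scraped date split like ['Posted:', '14th', 'July', '2021'] (hypothetical
+ # token order) becomes '2021-07-14' in the date parser above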
+ return month_dict[month]
+
def wordCount(text):
# approximate number of words in the submission text
@@ -732,18 +839,28 @@ def cleanHexCodes(text):
def cleanUrl(url):
# ensure each url begins with https:// and a proper domain (some scraped urls are incomplete)
-
- if not 'literotica.com' in url: url = 'https://www.literotica.com' + url
+
+ # detect and fix xnxx url
+ if '/story/' in url:
+ if not 'sexstories.com' in url: url = 'https://www.sexstories.com' + url
+
+ # detect and fix literotica url (if not xnxx, it's literotica)
+ if not 'sexstories.com' in url:
+ if not 'literotica.com' in url: url = 'https://www.literotica.com' + url
+
+ # detect and fix Wayback Machine url
if '/web/' in url:
if not 'web.archive.org' in url: url = 'https://web.archive.org' + url
- # insert im_ after retrieval date in wayback machine urls to create direct download links to resources
+ # insert 'im_' after retrieval date in wayback machine urls to create direct download links to resources
+ # specific to Wayback captures of literotica audios and illustrations
if '/illustra/' in url or '/audio/' in url:
url = url.replace('if_/','im_/')
if 'im_/' not in url:
i = url.rfind('/http')
url = url[:i]+'im_'+url[i:]
-
+
+ # ensure that each url begins with https:// before requesting source
if url.startswith('http://'): url = 'https://' + url[7:]
elif url.startswith('//'): url = 'https:' + url
elif url.startswith('www'): url = 'https://' + url
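+
+ # illustrative transformations (hypothetical inputs; Wayback timestamps shortened):
+ # cleanUrl('/story/1234/an_example_story') -> 'https://www.sexstories.com/story/1234/an_example_story'
+ # cleanUrl('//web.archive.org/web/2019/https://www.literotica.com/audio/ex.mp3')
+ #   -> 'https://web.archive.org/web/2019im_/https://www.literotica.com/audio/ex.mp3'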
@@ -765,28 +882,37 @@ def export(path,filename,output):
# return to original working directory
os.chdir(origCwd)
- # print(f"[Finished] Exported to {path}/{filename}")
- print(f"[Finished] Exported to {os.path.join(path, filename)}")
+ print(f"[Finished] Exported to: {os.path.join(path, filename)}")
def getOutput(obj):
# create the ultimate string which will be the final output
header = (
+ f'<title>{obj.title}</title>\n\n'
+ # plus several f'<meta ...>\n' lines carrying the story metadata
+ # (author, date, tags, series, etc.); their exact tags are elided here
+ )
+
+ body_header = (
+ # (each line below is wrapped in an HTML tag, elided here)
f'{obj.url}\n'
f'{obj.title}\n'
f'{obj.username}\n'
f'{obj.wordCount} words || {obj.category} || {obj.date}\n'
f'{obj.description}\n'
f'- - - - - - - - - - - - - -\n'
)
output = (
f'<html>\n\n'
- f'<head>\n'
- f'<title>{obj.title}</title>\n'
+ f'<head>\n\n'
+ f'{header}\n'
f'</head>\n\n'
f'<body>\n\n'
- f'{header}\n'
+ f'{body_header}\n'
f'{obj.text}\n\n'
f'</body>\n\n'
f'</html>\n'
@@ -927,7 +1053,7 @@ def getAudio(pageText, pageSource, title, username):
if len(audioUrls) == 0: pass
else:
- print(f"Detected {len(audioUrls)} audio(s) to download")
+ print(f"[Detected] {len(audioUrls)} audio(s) to download")
for audioUrl in audioUrls:
audioName = audioUrl.split('/')[-1]
@@ -939,7 +1065,7 @@ def getAudio(pageText, pageSource, title, username):
success = saveFile(audioUrl, audioName, savePath,)
if success:
savedCount += 1
- print(f"[Retrieved] {savedCount} of {len(audioUrls)} audio(s) ({skippedCount} Skipped)")
+ print(f"[Retrieved] {savedCount} of {len(audioUrls)} audio(s) ({skippedCount} Skipped)")
# if failure, attempt something else
elif 'web.archive.org' in audioUrl:
@@ -948,7 +1074,7 @@ def getAudio(pageText, pageSource, title, username):
success = saveFile(audioUrl, audioName, savePath,)
if success:
savedCount += 1
- print(f"[Retrieved] {savedCount} of {len(audioUrls)} audio(s) ({skippedCount} Skipped)")
+ print(f"[Retrieved] {savedCount} of {len(audioUrls)} audio(s) ({skippedCount} Skipped)")
else: skippedCount += 1
else: skippedCount += 1
@@ -956,7 +1082,7 @@ def getAudio(pageText, pageSource, title, username):
# search pageSource for audio (if embedded below pageText rather than linked within it)
if audioCount == 0: pass
- else: print(f"Detected {audioCount} audio(s) to download")
+ else: print(f"[Detected] {audioCount} audio(s) to download")
for i in range(audioCount):
@@ -970,14 +1096,14 @@ def getAudio(pageText, pageSource, title, username):
success = saveFile(audioUrl, audioName, savePath)
if success:
savedCount += 1
- print(f"[Retrieved] {savedCount} of {audioCount} audio(s) ({skippedCount} Skipped)")
+ print(f"[Retrieved] {savedCount} of {audioCount} audio(s) ({skippedCount} Skipped)")
elif 'web.archive.org' in audioUrl:
print('Trying a different URL of the same file...')
audioUrl = audioUrl.replace('im_/','if_/')
success = saveFile(audioUrl, audioName, savePath,)
if success:
savedCount += 1
- print(f"[Retrieved] {savedCount} of {len(audioUrls)} audio(s) ({skippedCount} Skipped)")
+ print(f"[Retrieved] {savedCount} of {len(audioUrls)} audio(s) ({skippedCount} Skipped)")
else: skippedCount += 1
else: skippedCount += 1
@@ -988,7 +1114,7 @@ def getImages(text, username):
imageCount = text.count('<img')  # '<img' is an assumed reconstruction of the stripped literal
@@ ... @@ def getList():
- print('Select submissions to download (eg: 1 3 7 12, 2-9, a = all, q = quit)')
- selection = input('--> ')
+ # if user included -a flag, automatically download all detected submissions
+ if a == 0:
+ print('--> Select submissions to download (eg: 1 3 7 12, 2-9, a = all, q = quit)')
+ selection = input('--> ')
+ if a == 1: selection = 'a'
# process user input
if selection == 'q': raise SystemExit
@@ -1091,7 +1224,7 @@ def getList():
skippedCount += 1
if finished == 1:
savedCount += 1
- print(f"[Completed {savedCount} of {len(downloadList)} Submissions ({skippedCount} Skipped)]")
+ print(f"[Completed] {savedCount} of {len(downloadList)} Submissions ({skippedCount} Skipped)")
raise SystemExit
if ' ' in selection:
@@ -1121,10 +1254,11 @@ def getList():
raise SystemExit
def scanForUrls(url):
- # scan any webpage for literotica submissions
+ # scan any webpage for literotica or xnxx submissions
- print('Scanning given page for Literotica Submissions... ', end='')
+ print('Scanning given page for Submissions... ', end='')
+ xnxxStoryCount = 0
storyCount = 0
originalStoryCount = 0
illustrationCount = 0
@@ -1133,11 +1267,31 @@ def scanForUrls(url):
pageSource = getSource(url)
+ totalXnxxStories = pageSource.count('/story/')
totalStories = pageSource.count('literotica.com/s/')
totalOriginalStories = pageSource.count('literotica.com/stories/showstory.php')
totalIllustrations = pageSource.count('literotica.com/i/')
totalPoems = pageSource.count('literotica.com/p/')
+ # scan for xnxx story urls
+ i = 0
+ while xnxxStoryCount < totalXnxxStories:
+
+ i = pageSource.find('/story/', i)
+
+ # find beginning and end of story URL
+ while pageSource[i-1] != '"': i -= 1
+ beg = i
+ while pageSource[i] != '"': i += 1
+ end = i
+
+ url = cleanUrl(pageSource[beg:end])
+
+ if url not in downloadList:
+ downloadList.append(url)
+
+ xnxxStoryCount += 1
+
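+ # e.g. given pageSource containing href="/story/777/an_example_tale" (hypothetical),
+ # the loop above walks outward to the enclosing quotes, then cleanUrl() yields
+ # 'https://www.sexstories.com/story/777/an_example_tale' for the downloadList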
# scan for story urls
i = 0
while storyCount < totalStories:
@@ -1214,12 +1368,17 @@ def scanForUrls(url):
poemCount += 1
- totalCount = storyCount+illustrationCount+poemCount
+ totalCount = xnxxStoryCount+storyCount+illustrationCount+poemCount
if totalCount != 0: print(f"found {totalCount}")
+ else: print('')
def parseArgs(args):
# parse all user arguments from the command line (urls or optional flags)
+
+ # use 'a' as a toggle to auto-download all submissions when the -a flag is given
+ global a
+ a = 0
if len(args) == 0: print(usage); raise SystemExit
@@ -1228,6 +1387,7 @@ def parseArgs(args):
if arg == '-h' or arg == '--help':
print(usage)
raise SystemExit
+ elif arg == '-a': a = 1
elif arg == '--version':
print(version)
print(updated)
@@ -1240,13 +1400,14 @@ def parseArgs(args):
if arg not in downloadList: downloadList.append(arg)
elif 'literotica.com/stories/showstory.php' in arg:
if arg not in downloadList: downloadList.append(arg)
+ elif 'sexstories.com/story/' in arg:
+ if arg not in downloadList: downloadList.append(arg)
else:
scanForUrls(arg)
# send downloadList to be comprehended and downloaded
getList()
-
# START PROGRAM
def main():
diff --git a/readMe.txt b/readMe.txt
index bee981e..a32bc1e 100644
--- a/readMe.txt
+++ b/readMe.txt
@@ -1,5 +1,4 @@
-
LITSTASH -- readMe
--------------------------
Created by NocturnalNebula
@@ -12,11 +11,14 @@
DESCRIPTION
'''''''''''
-This python script can download any submission from Literotica or from Wayback
-Machine captures of submissions. Submissions are saved in HTML format with
-embedded images, audios, and style tags preserved. This program can download
-batches of submissions from author pages, favorite pages, search results, or
-any other page containing Literotica URLs (or Wayback Machine captures).
+This Python script can download submissions from Literotica and stories from
+xnxx. It also supports Wayback Machine captures from either site. Submissions
+are saved in HTML format with embedded images, audios, and style tags preserved.
+Additionally, litstash can download batches of submissions from author pages,
+favorite pages, search results, or any other page containing submission URLs.
+
+Outputs include story metadata (author, series, date, etc...) for easy
+organization or conversion to epub in an ebook manager.
REQUIREMENTS
''''''''''''
@@ -48,9 +50,10 @@ In a terminal, clone the repository, enter it and run the script with:
You may need to type 'python3' instead of 'python' if you have multiple
versions of Python installed. Literotica URLs should be surrounded by
"quotes" because an '&' can confuse the shell. You can include multiple
-URLs. URLs can be any Literotica submissions, Wayback Machine captures of
-submissions, or any page containing links to submissions (or captures).
-Submissions will be exported to ".../lit-submissions/username/submission.html".
+URLs. URLs can be any Literotica submissions or xnxx stories, any page containing
+links to either of the above (e.g. author pages), or Wayback Machine captures
+of any of the above. Submissions will be exported to
+".../litstash-saves/username/___.html".
EXAMPLE COMMANDS:
diff --git a/versionChanges.txt b/versionChanges.txt
index d3f22a2..7d1a5e0 100644
--- a/versionChanges.txt
+++ b/versionChanges.txt
@@ -1,4 +1,13 @@
+1.4 - December 2023
+'''''''''''''''''''
+- outputs now include story metadata (author, date, tags, series, etc...)
+- created auto-download-all flag '-a' for batch downloads
+- added support for xnxx stories (including author pages and Wayback Machine)
+- output path is now '.../litstash-saves/username/___.html'
+- cleaned up output during downloading
+- unknown upload dates default to '0000-00-00'
+
1.3 - September 2023
''''''''''''''''''''
- complete program re-rewrite (now using classes rather than just functions)