From d704e8286cb924116606fedb4648ad16f6da6191 Mon Sep 17 00:00:00 2001 From: Commandcracker <49335821+Commandcracker@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:26:01 +0200 Subject: [PATCH] Make sh** faster (some cashing, regx improvements, orjson support), fix search lagging and crashing --- README.md | 9 ++- pyproject.toml | 12 ++-- src/gucken/__init__.py | 2 +- src/gucken/aniskip.py | 3 +- src/gucken/gucken.py | 92 ++++++++++++++++++++--------- src/gucken/hoster/doodstream.py | 31 +++------- src/gucken/hoster/streamtape.py | 17 ++---- src/gucken/hoster/veo.py | 8 +-- src/gucken/hoster/vidoza.py | 7 +-- src/gucken/networking.py | 8 +-- src/gucken/provider/aniworld.py | 6 +- src/gucken/provider/common.py | 3 + src/gucken/provider/serienstream.py | 7 ++- src/gucken/tracker/anilist.py | 3 +- src/gucken/tracker/myanimelist.py | 3 +- src/gucken/update.py | 3 +- src/gucken/utils.py | 8 +++ 17 files changed, 133 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index 00d4e1b..086237f 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,9 @@ Place your custom CSS in `user_config_path("gucken").joinpath("custom.css")` and ## Optional dependencies -- `levenshtein` - Faster fuzzy sort/search. (with: `gucken[levenshtein]`) +- `speedups` (with: `gucken[speedups]`) + - Faster fuzzy sort/search. (`levenshtein`) + - Faster json parsing. (`orjson`) - `socks` - SOCKS proxy support. 
(with: `gucken[socks]`) ## Todo @@ -196,6 +198,7 @@ selenium or playwright ### UX +- [ ] Add hotkey to clear cache (F5) - [ ] Translation DE, EN - [ ] Improve settings design - [ ] Merge SerienStream.to and AniWorld.to search results @@ -217,8 +220,9 @@ selenium or playwright ### Speedups - [ ] Pre-fetching -- [ ] Caching - [ ] More threads and asyncio.gather to make everything faster +- [ ] More Caching +- [ ] Reuse Client ### Code @@ -262,7 +266,6 @@ selenium or playwright ### Bugs & DX -- [ ] FIX TYPING SOMETIMES CAUSES CRASH - [ ] Proper error handling - [ ] Logging and Crash reports - [ ] Blacklist detection & bypass diff --git a/pyproject.toml b/pyproject.toml index e61784c..61a8474 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,14 +7,15 @@ maintainers = [{name="Commandcracker"}] license = {file = "LICENSE.txt"} readme = "README.md" dependencies = [ - "textual>=0.64.0", + "textual>=0.67.0", "beautifulsoup4>=4.12.3", "httpx[http2]>=0.27.0", "pypresence>=4.3.0", - "packaging>=24.0", + "packaging>=24.1", "platformdirs>=4.2.2", "toml>=0.10.2", - "fuzzywuzzy>=0.18.0" + "fuzzywuzzy>=0.18.0", + "async_lru>=2.0.4" #"yt-dlp>=2024.4.9", #"mpv>=1.0.6", ] @@ -47,7 +48,10 @@ classifiers = [ ] [project.optional-dependencies] -levenshtein = ["levenshtein>=0.25.1"] +speedups = [ + "levenshtein>=0.25.1", + "orjson>=3.10.4" +] socks = ["httpx[socks]>=0.27.0"] [project.urls] diff --git a/src/gucken/__init__.py b/src/gucken/__init__.py index 6044c5e..7fbad47 100644 --- a/src/gucken/__init__.py +++ b/src/gucken/__init__.py @@ -1,4 +1,4 @@ import warnings warnings.filterwarnings('ignore', message='Using slow pure-python SequenceMatcher. 
Install python-Levenshtein to remove this warning') -__version__ = "0.1.12" +__version__ = "0.2.0" diff --git a/src/gucken/aniskip.py b/src/gucken/aniskip.py index 1ccd286..4c3854c 100644 --- a/src/gucken/aniskip.py +++ b/src/gucken/aniskip.py @@ -7,6 +7,7 @@ from .networking import AsyncClient from .tracker.myanimelist import search from .rome import replace_roman_numerals +from .utils import json_loads @dataclass @@ -24,7 +25,7 @@ async def get_timings_from_id( response = await client.get( f"https://api.aniskip.com/v1/skip-times/{anime_id}/{episode_number}?types=op&types=ed" ) - json = response.json() + json = json_loads(response.content) if json.get("found") is not True: return op_start = 0 diff --git a/src/gucken/gucken.py b/src/gucken/gucken.py index 181c3ff..6d9a859 100644 --- a/src/gucken/gucken.py +++ b/src/gucken/gucken.py @@ -2,7 +2,7 @@ from textual.command import Hits, Provider as TextualProvider, Hit, DiscoveryHit import argparse import logging -from asyncio import gather +from asyncio import gather, set_event_loop, new_event_loop from atexit import register as register_atexit from os import remove, name as os_name from os.path import join @@ -12,6 +12,7 @@ from subprocess import DEVNULL, PIPE, Popen from time import sleep, time from typing import ClassVar, List, Union +from async_lru import alru_cache from fuzzywuzzy import fuzz from platformdirs import user_config_path, user_log_path @@ -39,6 +40,7 @@ TabbedContent, TabPane, ) +from textual.worker import get_current_worker from .aniskip import ( generate_chapters_file, @@ -375,15 +377,8 @@ def compose(self) -> ComposeResult: @on(Input.Changed) async def input_changed(self, event: Input.Changed): - id = event.control.id - value = event.value - - if id == "input": - if value: - self.lookup_anime(value) - else: - # TODO: fix sometimes wont clear - await self.query_one("#results", ListView).clear() + if event.control.id == "input": + self.lookup_anime(event.value) @on(SortableTable.SortChanged) async def 
sortableTable_sortChanged( @@ -507,21 +502,58 @@ async def disable_RPC(self): self.RPC.sock_writer.close() self.RPC = None - # TODO: https://textual.textualize.io/guide/workers/#thread-workers + @alru_cache(maxsize=64, ttl=600) # Cache 64 entries. Clear entry after 10 minutes. + async def aniworld_search(self, keyword: str) -> Union[list[SearchResult], None]: + return await AniWorldProvider.search(keyword) + + @alru_cache(maxsize=64, ttl=600) # Cache 64 entries. Clear entry after 10 minutes. + async def serienstream_search(self, keyword: str) -> Union[list[SearchResult], None]: + return await SerienStreamProvider.search(keyword) + + def sync_gather(self, tasks: list): + async def gather_all(): + return await gather(*tasks) + + loop = new_event_loop() + set_event_loop(loop) + return loop.run_until_complete(gather_all()) + # TODO: Exit on error when debug = true - @work(exclusive=True) #exit_on_error=False - async def lookup_anime(self, keyword: str) -> None: + # TODO: sometimes not removing loading state + # TODO: FIX + """ + sys:1: RuntimeWarning: coroutine '_LRUCacheWrapperInstanceMethod.__call__' was never awaited + RuntimeWarning: Enable tracemalloc to get the object allocation traceback + """ + + @work(exclusive=True, thread=True, exit_on_error=False) + def lookup_anime(self, keyword: str) -> None: + results_list_view = self.query_one("#results", ListView) + worker = get_current_worker() + + if keyword is None: + if not worker.is_cancelled: + self.call_from_thread(results_list_view.clear) + results_list_view.loading = False + return + + aniworld_to = self.query_one("#aniworld_to", Checkbox).value + serienstream_to = self.query_one("#serienstream_to", Checkbox).value + search_providers = [] - if self.query_one("#aniworld_to", Checkbox).value: - search_providers.append(AniWorldProvider.search(keyword)) - if self.query_one("#serienstream_to", Checkbox).value: - search_providers.append(SerienStreamProvider.search(keyword)) + if aniworld_to: + 
search_providers.append(self.aniworld_search(keyword)) + if serienstream_to: + search_providers.append(self.serienstream_search(keyword)) - results_list_view = self.query_one("#results", ListView) - await results_list_view.clear() + if worker.is_cancelled: + return + self.call_from_thread(results_list_view.clear) results_list_view.loading = True - results = await gather(*search_providers) + if worker.is_cancelled: + return + results = self.sync_gather(search_providers) final_results = [] for l in results: if l is not None: @@ -542,7 +574,9 @@ def fuzzy_sort_key(result): f"\n{series.description}" ) )) - await results_list_view.extend(items) + if worker.is_cancelled: + return + self.call_from_thread(results_list_view.extend, items) results_list_view.loading = False if len(final_results) > 0: @@ -552,7 +586,7 @@ def select_first_index(): except AssertionError: pass - self.app.call_later(select_first_index) + self.call_later(select_first_index) async def on_key(self, event: events.Key) -> None: key = event.key @@ -597,6 +631,10 @@ async def play_selected(self): ) dt.loading = False + @alru_cache(maxsize=32, ttl=600) # Cache 32 entries. Clear entry after 10 minutes. 
+ async def get_series(self, series_search_result: SearchResult): + return await series_search_result.get_series() + @work(exclusive=True) async def open_info(self) -> None: series_search_result: SearchResult = self.current[ @@ -605,13 +643,14 @@ async def open_info(self) -> None: info_tab = self.query_one("#info", TabPane) info_tab.disabled = False info_tab.loading = True - self.query_one(TabbedContent).active = "info" + table = self.query_one("#season_list", DataTable) + table.focus(scroll_visible=False) md = self.query_one("#markdown", Markdown) - series = await series_search_result.get_series() + + series = await self.get_series(series_search_result) self.current_info = series await md.update(series.to_markdown()) - table = self.query_one("#season_list", DataTable) table.clear() c = 0 for ep in series.episodes: @@ -632,7 +671,6 @@ async def open_info(self) -> None: " ".join(sort_favorite_hoster_by_key(hl, self.hoster)), " ".join(ll), ) - table.focus(scroll_visible=False) info_tab.loading = False @work(exclusive=True, thread=True) @@ -902,7 +940,7 @@ def main(): if args.debug: logs_path = user_log_path("gucken", ensure_exists=True) logging.basicConfig( - filename=logs_path.joinpath("gucken.log"), encoding="utf-8", level=logging.INFO + filename=logs_path.joinpath("gucken.log"), encoding="utf-8", level=logging.INFO, force=True ) register_atexit(gucken_settings_manager.save) diff --git a/src/gucken/hoster/doodstream.py b/src/gucken/hoster/doodstream.py index eae79c9..83254b1 100644 --- a/src/gucken/hoster/doodstream.py +++ b/src/gucken/hoster/doodstream.py @@ -3,12 +3,11 @@ from re import compile as re_compile from string import ascii_letters, digits from time import time -from urllib.parse import urlparse from ..networking import AsyncClient from .common import DirectLink, Hoster -EXTRACT_DOODSTREAM_HLS_PATTERN = re_compile(r"/pass_md5/[\w-]+/[\w-]+") +EXTRACT_DOODSTREAM_HLS_PATTERN = re_compile(r"/pass_md5/[\w-]+/(?P<token>[\w-]+)") def random_str(length: int = 10) -> 
str: @@ -19,30 +18,18 @@ def js_date_now() -> int: return int(time() * 1000) -headers = {"Referer": "https://d0000d.com/"} - - @dataclass class DoodstreamHoster(Hoster): requires_headers = True async def get_direct_link(self) -> DirectLink: - async with AsyncClient(verify=False) as client: - response = await client.head(self.url) - if response.has_redirect_location: - u2 = ( - urlparse(response.headers.get("Location")) - ._replace(netloc="d000d.com") - .geturl() - ) - response = await client.get(u2) - - pass_md5 = EXTRACT_DOODSTREAM_HLS_PATTERN.search(response.text) - response = await client.get( - f"https://d0000d.com{pass_md5.group()}", - headers={"Referer": "https://d0000d.com/"}, - ) + async with AsyncClient(verify=False, auto_referer=True) as client: + response1 = await client.get(self.url) + match = EXTRACT_DOODSTREAM_HLS_PATTERN.search(response1.text) + + # Require Referer + response2 = await client.get(str(response1.url.copy_with(path=match.group()))) return DirectLink( - url=f"{response.text}{random_str()}?token={pass_md5.group().split('/')[-1]}&expiry={js_date_now()}", - headers=headers, + url=f"{response2.text}{random_str()}?token={match.group("token")}&expiry={js_date_now()}", + headers={"Referer": str(response2.url.copy_with(path="/"))}, ) diff --git a/src/gucken/hoster/streamtape.py b/src/gucken/hoster/streamtape.py index 9f64ed8..d8d6899 100644 --- a/src/gucken/hoster/streamtape.py +++ b/src/gucken/hoster/streamtape.py @@ -3,9 +3,7 @@ from ..networking import AsyncClient from .common import DirectLink, Hoster -STREAMTAPE_PATTERN = re_compile(r"botlink(.*?)innerHTML(.*?)\);") -STREAMTAPE_PATTERN_SUBSTRING = re_compile(r"substring\(\d+") -STREAMTAPE_PATTERN_DIGETS = re_compile(r"\d+") +STREAMTAPE_PATTERN = re_compile(r"'botlink.*innerHTML.*?'(?P<s1>.*)'.*?\+.*?'(?P<s2>.*)'") class StreamtapeHoster(Hoster): @@ -13,17 +11,10 @@ async def get_direct_link(self) -> DirectLink: # TODO: Error checking async with AsyncClient(verify=False) as client: response = await 
client.get(self.url) + # TODO: Save html and error in order to investigate # with open("out.txt", "wb") as f: # f.write(response.text.encode('utf-8')) - video_src = STREAMTAPE_PATTERN.search(response.text) - j1 = "".join(video_src.groups()) - u1 = j1.split(" ")[2][1:-2] - u2 = j1[j1.index("('") + 2 : j1.rfind("')")] - - matches = STREAMTAPE_PATTERN_SUBSTRING.findall(j1) - for match in matches: - sub = STREAMTAPE_PATTERN_DIGETS.search(match).group(0) - u2 = u2[int(sub) :] - return DirectLink(f"https:{u1}{u2}") + match = STREAMTAPE_PATTERN.search(response.text) + return DirectLink(f"https:{match.group("s1")}{match.group('s2')[4:]}") diff --git a/src/gucken/hoster/veo.py b/src/gucken/hoster/veo.py index 7968eba..84afc2c 100644 --- a/src/gucken/hoster/veo.py +++ b/src/gucken/hoster/veo.py @@ -4,13 +4,13 @@ from ..networking import AsyncClient from .common import DirectLink, Hoster -EXTRACT_VEO_HLS_PATTERN = re_compile(r"'hls': '(.*?)'") +EXTRACT_VEO_HLS_PATTERN = re_compile(r"'hls': '(?P<hls>.*)'") class VOEHoster(Hoster): async def get_direct_link(self) -> DirectLink: async with AsyncClient(verify=False) as client: response = await client.get(self.url) - match_hls = EXTRACT_VEO_HLS_PATTERN.search(response.text) - hls_link = match_hls.group(1) - return DirectLink(b64decode(hls_link).decode()) + match = EXTRACT_VEO_HLS_PATTERN.search(response.text) + link = match.group("hls") + return DirectLink(b64decode(link).decode()) diff --git a/src/gucken/hoster/vidoza.py b/src/gucken/hoster/vidoza.py index a6c9654..06a4324 100644 --- a/src/gucken/hoster/vidoza.py +++ b/src/gucken/hoster/vidoza.py @@ -4,9 +4,8 @@ from .common import DirectLink, Hoster -# TODO: improve all patterns EXTRACT_VIDOZA_HLS_PATTERN = re_compile( - r"sourcesCode:.*?\[.*?\{.*?src:.*?[\'|\"](?P.*?)[\'|\"]," + r"sourcesCode:.*?\[.*?\{.*?src:.*?[\'|\"](?P<mp4>.*?)[\'|\"]," ) @@ -14,5 +13,5 @@ class VidozaHoster(Hoster): async def get_direct_link(self) -> DirectLink: async with AsyncClient(verify=False) as client: 
response = await client.get(self.url) - match_hls = EXTRACT_VIDOZA_HLS_PATTERN.search(response.text) - return DirectLink(match_hls.group(1)) + match = EXTRACT_VIDOZA_HLS_PATTERN.search(response.text) + return DirectLink(match.group("mp4")) diff --git a/src/gucken/networking.py b/src/gucken/networking.py index 5b37591..de0da4b 100644 --- a/src/gucken/networking.py +++ b/src/gucken/networking.py @@ -1,5 +1,5 @@ from enum import Enum -from json import loads +from .utils import json_loads from pathlib import Path from random import choice from urllib.parse import urlparse @@ -17,7 +17,7 @@ user_agents_path = Path(__file__).parent.joinpath("resources", "user_agents.json") with open(user_agents_path, "r") as f: user_agents_raw = f.read() -user_agents = loads(user_agents_raw) +user_agents = json_loads(user_agents_raw) class AsyncHTTPSRedirectTransport(AsyncBaseTransport): @@ -106,10 +106,10 @@ async def request(self, *args, **kwargs) -> Response: async def main(): async with AsyncClient() as client: response = await client.get("https://httpbin.org/headers") - print(response.json()) + print(json_loads(response.content)) async with HttpxAsyncClient() as client: response = await client.get("https://httpbin.org/headers") - print(response.json()) + print(json_loads(response.content)) if __name__ == "__main__": run(main()) diff --git a/src/gucken/provider/aniworld.py b/src/gucken/provider/aniworld.py index b038ed5..8787541 100644 --- a/src/gucken/provider/aniworld.py +++ b/src/gucken/provider/aniworld.py @@ -11,6 +11,7 @@ from ..hoster.veo import VOEHoster from ..hoster.vidoza import VidozaHoster from .common import Episode, Hoster, Language, Provider, SearchResult, Series +from ..utils import json_loads def provider_to_hoster(provider: str, url: str) -> Hoster: @@ -122,6 +123,9 @@ async def get_series(self) -> AniWorldSeries: def url(self) -> str: return f"https://{self.host}/anime/stream/{self.link}" + def __hash__(self): + return super().__hash__() + @dataclass class 
AniWorldProvider(Provider): @@ -135,7 +139,7 @@ async def search(keyword: str) -> Union[list[AniWorldSearchResult], None]: response = await client.get( f"https://{AniWorldProvider.host}/ajax/seriesSearch?keyword={keyword}" ) - results = response.json() + results = json_loads(response.content) search_results = [] for series in results: search_results.append( diff --git a/src/gucken/provider/common.py b/src/gucken/provider/common.py index cc54d86..db83b5b 100644 --- a/src/gucken/provider/common.py +++ b/src/gucken/provider/common.py @@ -64,6 +64,9 @@ async def get_series(self) -> Series: def url(self) -> str: raise NotImplementedError + def __hash__(self): + return hash(self.provider_name + self.name + self.description) + class Provider(ABC): diff --git a/src/gucken/provider/serienstream.py b/src/gucken/provider/serienstream.py index bcb262c..9c1790f 100644 --- a/src/gucken/provider/serienstream.py +++ b/src/gucken/provider/serienstream.py @@ -11,10 +11,10 @@ from ..hoster.veo import VOEHoster from ..hoster.vidoza import VidozaHoster from .common import Episode, Hoster, Language, Provider, SearchResult, Series +from ..utils import json_loads # TODO: Timeouts # TODO: use base_url -# TODO: faster json # TODO: reuse same client # TODO: do serienstream resolve using mounts (remove veryfy fale from hosts) @@ -133,6 +133,9 @@ async def get_series(self) -> SerienStreamSeries: def url(self) -> str: return f"https://{self.host}/serie/stream/{self.link}" + def __hash__(self): + return super().__hash__() + @dataclass class SerienStreamProvider(Provider): @@ -146,7 +149,7 @@ async def search(keyword: str) -> Union[list[SerienStreamSearchResult], None]: response = await client.get( f"https://{SerienStreamProvider.host}/ajax/seriesSearch?keyword={keyword}", headers=headers, extensions=extensions ) - results = response.json() + results = json_loads(response.content) search_results = [] for series in results: search_results.append( diff --git a/src/gucken/tracker/anilist.py 
b/src/gucken/tracker/anilist.py index 3432c5c..ba4bf56 100644 --- a/src/gucken/tracker/anilist.py +++ b/src/gucken/tracker/anilist.py @@ -1,4 +1,5 @@ from ..networking import AsyncClient +from ..utils import json_loads SEARCH_QUERY = """ query ($id: Int, $page: Int, $perPage: Int, $search: String) { @@ -30,4 +31,4 @@ async def search(keyword: str) -> dict: headers={"Content-Type": "application/json"}, json={"query": SEARCH_QUERY, "variables": {"search": keyword}}, ) - return response.json() + return json_loads(response.content) diff --git a/src/gucken/tracker/myanimelist.py b/src/gucken/tracker/myanimelist.py index 02039c8..0e023ba 100644 --- a/src/gucken/tracker/myanimelist.py +++ b/src/gucken/tracker/myanimelist.py @@ -1,4 +1,5 @@ from ..networking import AsyncClient +from ..utils import json_loads async def search(keyword: str) -> dict: @@ -6,4 +7,4 @@ async def search(keyword: str) -> dict: response = await client.get( f"https://myanimelist.net/search/prefix.json?type=anime&keyword={keyword}" ) - return response.json() + return json_loads(response.content) diff --git a/src/gucken/update.py b/src/gucken/update.py index 754f9a3..0b63df9 100644 --- a/src/gucken/update.py +++ b/src/gucken/update.py @@ -5,6 +5,7 @@ from packaging.version import Version from . 
import __version__ as current_version +from .utils import json_loads PACKAGE_NAME = "gucken" @@ -18,7 +19,7 @@ class UpdateResult: async def get_latest_version(): async with AsyncClient() as client: response = await client.get(f"https://pypi.org/pypi/{PACKAGE_NAME}/json") - return response.json().get("info").get("version") + return json_loads(response.content).get("info").get("version") async def check() -> Union[UpdateResult, None]: diff --git a/src/gucken/utils.py b/src/gucken/utils.py index 8be7576..75984ef 100644 --- a/src/gucken/utils.py +++ b/src/gucken/utils.py @@ -1,3 +1,4 @@ +#import logging import os import sys @@ -14,6 +15,13 @@ is_android = hasattr(sys, "getandroidapilevel") +try: + from orjson import loads as json_loads + # logging.debug("Using orjson") +except ImportError: + from json import loads as json_loads + # logging.debug("Using default json") + def detect_player() -> Union[Player, None]: if is_android: