Skip to content

Commit

Permalink
Make sh** faster (some caching, regex improvements, orjson support), f…
Browse files Browse the repository at this point in the history
…ix search lagging and crashing
  • Loading branch information
Commandcracker committed Jun 12, 2024
1 parent 187e1ff commit d704e82
Show file tree
Hide file tree
Showing 17 changed files with 133 additions and 89 deletions.
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ Place your custom CSS in `user_config_path("gucken").joinpath("custom.css")` and

## Optional dependencies

- `levenshtein` - Faster fuzzy sort/search. (with: `gucken[levenshtein]`)
- `speedups` (with: `gucken[speedups]`)
- Faster fuzzy sort/search. (`levenshtein`)
- Faster json parsing. (`orjson`)
- `socks` - SOCKS proxy support. (with: `gucken[socks]`)

## Todo
Expand Down Expand Up @@ -196,6 +198,7 @@ selenium or playwright

### UX

- [ ] Add hotkey to clear cache (F5)
- [ ] Translation DE, EN
- [ ] Improve settings design
- [ ] Merge SerienStream.to and AniWorld.to search results
Expand All @@ -217,8 +220,9 @@ selenium or playwright
### Speedups

- [ ] Pre-fetching
- [ ] Caching
- [ ] More threads and asyncio.gather to make everything faster
- [ ] More Caching
- [ ] Reuse Client

### Code

Expand Down Expand Up @@ -262,7 +266,6 @@ selenium or playwright

### Bugs & DX

- [ ] FIX TYPING SOMETIMES CAUSES CRASH
- [ ] Proper error handling
- [ ] Logging and Crash reports
- [ ] Blacklist detection & bypass
Expand Down
12 changes: 8 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ maintainers = [{name="Commandcracker"}]
license = {file = "LICENSE.txt"}
readme = "README.md"
dependencies = [
"textual>=0.64.0",
"textual>=0.67.0",
"beautifulsoup4>=4.12.3",
"httpx[http2]>=0.27.0",
"pypresence>=4.3.0",
"packaging>=24.0",
"packaging>=24.1",
"platformdirs>=4.2.2",
"toml>=0.10.2",
"fuzzywuzzy>=0.18.0"
"fuzzywuzzy>=0.18.0",
"async_lru>=2.0.4"
#"yt-dlp>=2024.4.9",
#"mpv>=1.0.6",
]
Expand Down Expand Up @@ -47,7 +48,10 @@ classifiers = [
]

[project.optional-dependencies]
levenshtein = ["levenshtein>=0.25.1"]
speedups = [
"levenshtein>=0.25.1",
"orjson>=3.10.4"
]
socks = ["httpx[socks]>=0.27.0"]

[project.urls]
Expand Down
2 changes: 1 addition & 1 deletion src/gucken/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import warnings
warnings.filterwarnings('ignore', message='Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')

__version__ = "0.1.12"
__version__ = "0.2.0"
3 changes: 2 additions & 1 deletion src/gucken/aniskip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .networking import AsyncClient
from .tracker.myanimelist import search
from .rome import replace_roman_numerals
from .utils import json_loads


@dataclass
Expand All @@ -24,7 +25,7 @@ async def get_timings_from_id(
response = await client.get(
f"https://api.aniskip.com/v1/skip-times/{anime_id}/{episode_number}?types=op&types=ed"
)
json = response.json()
json = json_loads(response.content)
if json.get("found") is not True:
return
op_start = 0
Expand Down
92 changes: 65 additions & 27 deletions src/gucken/gucken.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from textual.command import Hits, Provider as TextualProvider, Hit, DiscoveryHit
import argparse
import logging
from asyncio import gather
from asyncio import gather, set_event_loop, new_event_loop
from atexit import register as register_atexit
from os import remove, name as os_name
from os.path import join
Expand All @@ -12,6 +12,7 @@
from subprocess import DEVNULL, PIPE, Popen
from time import sleep, time
from typing import ClassVar, List, Union
from async_lru import alru_cache

from fuzzywuzzy import fuzz
from platformdirs import user_config_path, user_log_path
Expand Down Expand Up @@ -39,6 +40,7 @@
TabbedContent,
TabPane,
)
from textual.worker import get_current_worker

from .aniskip import (
generate_chapters_file,
Expand Down Expand Up @@ -375,15 +377,8 @@ def compose(self) -> ComposeResult:

@on(Input.Changed)
async def input_changed(self, event: Input.Changed):
id = event.control.id
value = event.value

if id == "input":
if value:
self.lookup_anime(value)
else:
# TODO: fix sometimes wont clear
await self.query_one("#results", ListView).clear()
if event.control.id == "input":
self.lookup_anime(event.value)

@on(SortableTable.SortChanged)
async def sortableTable_sortChanged(
Expand Down Expand Up @@ -507,21 +502,58 @@ async def disable_RPC(self):
self.RPC.sock_writer.close()
self.RPC = None

# TODO: https://textual.textualize.io/guide/workers/#thread-workers
@alru_cache(maxsize=64, ttl=600) # Cache 64 entries. Clear entry after 10 minutes.
async def aniworld_search(self, keyword: str) -> Union[list[SearchResult], None]:
return await AniWorldProvider.search(keyword)

@alru_cache(maxsize=64, ttl=600) # Cache 64 entries. Clear entry after 10 minutes.
async def serienstream_search(self, keyword: str) -> Union[list[SearchResult], None]:
return await SerienStreamProvider.search(keyword)

def sync_gather(self, tasks: list):
async def gather_all():
return await gather(*tasks)

loop = new_event_loop()
set_event_loop(loop)
return loop.run_until_complete(gather_all())

# TODO: Exit on error when debug = true
@work(exclusive=True) #exit_on_error=False
async def lookup_anime(self, keyword: str) -> None:
# TODO: sometimes not removing loading state
# TODO: FIX
"""
sys:1: RuntimeWarning: coroutine '_LRUCacheWrapperInstanceMethod.__call__' was never awaited
RuntimeWarning: Enable tracemalloc to get the object allocation traceback
"""

@work(exclusive=True, thread=True, exit_on_error=False)
def lookup_anime(self, keyword: str) -> None:
results_list_view = self.query_one("#results", ListView)
worker = get_current_worker()

if keyword is None:
if not worker.is_cancelled:
self.call_from_thread(results_list_view.clear)
results_list_view.loading = False
return

aniworld_to = self.query_one("#aniworld_to", Checkbox).value
serienstream_to = self.query_one("#serienstream_to", Checkbox).value

search_providers = []
if self.query_one("#aniworld_to", Checkbox).value:
search_providers.append(AniWorldProvider.search(keyword))

if self.query_one("#serienstream_to", Checkbox).value:
search_providers.append(SerienStreamProvider.search(keyword))
if aniworld_to:
search_providers.append(self.aniworld_search(keyword))
if serienstream_to:
search_providers.append(self.serienstream_search(keyword))

results_list_view = self.query_one("#results", ListView)
await results_list_view.clear()
if worker.is_cancelled:
return
self.call_from_thread(results_list_view.clear)
results_list_view.loading = True
results = await gather(*search_providers)
if worker.is_cancelled:
return
results = self.sync_gather(search_providers)
final_results = []
for l in results:
if l is not None:
Expand All @@ -542,7 +574,9 @@ def fuzzy_sort_key(result):
f"\n{series.description}"
)
))
await results_list_view.extend(items)
if worker.is_cancelled:
return
self.call_from_thread(results_list_view.extend, items)
results_list_view.loading = False
if len(final_results) > 0:

Expand All @@ -552,7 +586,7 @@ def select_first_index():
except AssertionError:
pass

self.app.call_later(select_first_index)
self.call_later(select_first_index)

async def on_key(self, event: events.Key) -> None:
key = event.key
Expand Down Expand Up @@ -597,6 +631,10 @@ async def play_selected(self):
)
dt.loading = False

@alru_cache(maxsize=32, ttl=600) # Cache 32 entries. Clear entry after 10 minutes.
async def get_series(self, series_search_result: SearchResult):
return await series_search_result.get_series()

@work(exclusive=True)
async def open_info(self) -> None:
series_search_result: SearchResult = self.current[
Expand All @@ -605,13 +643,14 @@ async def open_info(self) -> None:
info_tab = self.query_one("#info", TabPane)
info_tab.disabled = False
info_tab.loading = True
self.query_one(TabbedContent).active = "info"
table = self.query_one("#season_list", DataTable)
table.focus(scroll_visible=False)
md = self.query_one("#markdown", Markdown)
series = await series_search_result.get_series()

series = await self.get_series(series_search_result)
self.current_info = series
await md.update(series.to_markdown())

table = self.query_one("#season_list", DataTable)
table.clear()
c = 0
for ep in series.episodes:
Expand All @@ -632,7 +671,6 @@ async def open_info(self) -> None:
" ".join(sort_favorite_hoster_by_key(hl, self.hoster)),
" ".join(ll),
)
table.focus(scroll_visible=False)
info_tab.loading = False

@work(exclusive=True, thread=True)
Expand Down Expand Up @@ -902,7 +940,7 @@ def main():
if args.debug:
logs_path = user_log_path("gucken", ensure_exists=True)
logging.basicConfig(
filename=logs_path.joinpath("gucken.log"), encoding="utf-8", level=logging.INFO
filename=logs_path.joinpath("gucken.log"), encoding="utf-8", level=logging.INFO, force=True
)

register_atexit(gucken_settings_manager.save)
Expand Down
31 changes: 9 additions & 22 deletions src/gucken/hoster/doodstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
from re import compile as re_compile
from string import ascii_letters, digits
from time import time
from urllib.parse import urlparse

from ..networking import AsyncClient
from .common import DirectLink, Hoster

EXTRACT_DOODSTREAM_HLS_PATTERN = re_compile(r"/pass_md5/[\w-]+/[\w-]+")
EXTRACT_DOODSTREAM_HLS_PATTERN = re_compile(r"/pass_md5/[\w-]+/(?P<token>[\w-]+)")


def random_str(length: int = 10) -> str:
Expand All @@ -19,30 +18,18 @@ def js_date_now() -> int:
return int(time() * 1000)


headers = {"Referer": "https://d0000d.com/"}


@dataclass
class DoodstreamHoster(Hoster):
requires_headers = True

async def get_direct_link(self) -> DirectLink:
async with AsyncClient(verify=False) as client:
response = await client.head(self.url)
if response.has_redirect_location:
u2 = (
urlparse(response.headers.get("Location"))
._replace(netloc="d000d.com")
.geturl()
)
response = await client.get(u2)

pass_md5 = EXTRACT_DOODSTREAM_HLS_PATTERN.search(response.text)
response = await client.get(
f"https://d0000d.com{pass_md5.group()}",
headers={"Referer": "https://d0000d.com/"},
)
async with AsyncClient(verify=False, auto_referer=True) as client:
response1 = await client.get(self.url)
match = EXTRACT_DOODSTREAM_HLS_PATTERN.search(response1.text)

# Require Referer
response2 = await client.get(str(response1.url.copy_with(path=match.group())))
return DirectLink(
url=f"{response.text}{random_str()}?token={pass_md5.group().split('/')[-1]}&expiry={js_date_now()}",
headers=headers,
url=f"{response2.text}{random_str()}?token={match.group("token")}&expiry={js_date_now()}",
headers={"Referer": str(response2.url.copy_with(path="/"))},
)
17 changes: 4 additions & 13 deletions src/gucken/hoster/streamtape.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,18 @@
from ..networking import AsyncClient
from .common import DirectLink, Hoster

STREAMTAPE_PATTERN = re_compile(r"botlink(.*?)innerHTML(.*?)\);")
STREAMTAPE_PATTERN_SUBSTRING = re_compile(r"substring\(\d+")
STREAMTAPE_PATTERN_DIGETS = re_compile(r"\d+")
STREAMTAPE_PATTERN = re_compile(r"'botlink.*innerHTML.*?'(?P<s1>.*)'.*?\+.*?'(?P<s2>.*)'")


class StreamtapeHoster(Hoster):
async def get_direct_link(self) -> DirectLink:
# TODO: Error checking
async with AsyncClient(verify=False) as client:
response = await client.get(self.url)

# TODO: Save html and error in order to investigate
# with open("out.txt", "wb") as f:
# f.write(response.text.encode('utf-8'))
video_src = STREAMTAPE_PATTERN.search(response.text)
j1 = "".join(video_src.groups())
u1 = j1.split(" ")[2][1:-2]
u2 = j1[j1.index("('") + 2 : j1.rfind("')")]

matches = STREAMTAPE_PATTERN_SUBSTRING.findall(j1)
for match in matches:
sub = STREAMTAPE_PATTERN_DIGETS.search(match).group(0)
u2 = u2[int(sub) :]

return DirectLink(f"https:{u1}{u2}")
match = STREAMTAPE_PATTERN.search(response.text)
return DirectLink(f"https:{match.group("s1")}{match.group('s2')[4:]}")
8 changes: 4 additions & 4 deletions src/gucken/hoster/veo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from ..networking import AsyncClient
from .common import DirectLink, Hoster

EXTRACT_VEO_HLS_PATTERN = re_compile(r"'hls': '(.*?)'")
EXTRACT_VEO_HLS_PATTERN = re_compile(r"'hls': '(?P<hls>.*)'")


class VOEHoster(Hoster):
async def get_direct_link(self) -> DirectLink:
async with AsyncClient(verify=False) as client:
response = await client.get(self.url)
match_hls = EXTRACT_VEO_HLS_PATTERN.search(response.text)
hls_link = match_hls.group(1)
return DirectLink(b64decode(hls_link).decode())
match = EXTRACT_VEO_HLS_PATTERN.search(response.text)
link = match.group("hls")
return DirectLink(b64decode(link).decode())
7 changes: 3 additions & 4 deletions src/gucken/hoster/vidoza.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@

from .common import DirectLink, Hoster

# TODO: improve all patterns
EXTRACT_VIDOZA_HLS_PATTERN = re_compile(
r"sourcesCode:.*?\[.*?\{.*?src:.*?[\'|\"](?P<hls>.*?)[\'|\"],"
r"sourcesCode:.*?\[.*?\{.*?src:.*?[\'|\"](?P<mp4>.*?)[\'|\"],"
)


class VidozaHoster(Hoster):
async def get_direct_link(self) -> DirectLink:
async with AsyncClient(verify=False) as client:
response = await client.get(self.url)
match_hls = EXTRACT_VIDOZA_HLS_PATTERN.search(response.text)
return DirectLink(match_hls.group(1))
match = EXTRACT_VIDOZA_HLS_PATTERN.search(response.text)
return DirectLink(match.group("mp4"))
Loading

0 comments on commit d704e82

Please sign in to comment.