From 70099968a1029d4844b8bfde48ecfca2833fbd0b Mon Sep 17 00:00:00 2001 From: Mani Mozaffar Date: Sun, 16 Jul 2023 16:35:36 +0300 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Redesigned=20Boilerplate?= =?UTF-8?q?=20(#39)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ♻️ refactor design * 🗑️ Deprecated selectolax * ♻️ improved contracts and base design * ✅ add rocketry serve test * 🦺 Update AioHttpEngine `cookies` and `proxy` type * ✅ Add engine test for attrs * 🎨 Update pre-commit and `pyproject.toml` and `setup.cfg` * 🎨 Apply format with conf on all project * 🔧 update config files * 🚨 fix mypy type errors * 🎨 improve core adaptor * ✅ Update tests * 👷 Update req-dev * 👷 Update tox.ini * 🎨 Improved and centralized protos * ⚡️ Update `_get_cookie` and add a test for non-setuped engine --------- Co-authored-by: Sadegh Yazdani --- .pre-commit-config.yaml | 7 +- README.md | 2 +- docs_src/initilizing_project/sample1/main.py | 2 +- .../initilizing_project/sample1/wikipedia.py | 11 +- fastcrawler/__init__.py | 9 +- fastcrawler/core/__init__.py | 2 +- fastcrawler/core/registery.py | 1 + fastcrawler/core/spider.py | 30 ++--- fastcrawler/engine/__init__.py | 6 +- fastcrawler/engine/aio.py | 96 ++++++++++----- fastcrawler/engine/constants.py | 0 fastcrawler/engine/{base.py => contracts.py} | 26 ++-- fastcrawler/engine/playwright.py | 112 ++++++++++++++++++ fastcrawler/parsers/__init__.py | 3 +- fastcrawler/parsers/{base.py => contracts.py} | 0 fastcrawler/parsers/html.py | 15 ++- fastcrawler/parsers/json.py | 26 ++-- .../processors/{base.py => contracts.py} | 15 ++- fastcrawler/parsers/processors/lxml.py | 6 +- fastcrawler/parsers/processors/modest.py | 65 +++++----- .../parsers/{pydantic.py => schema.py} | 0 fastcrawler/parsers/selectors/base.py | 39 +++--- fastcrawler/parsers/selectors/contracts.py | 33 ++++++ fastcrawler/parsers/selectors/css.py | 17 +-- fastcrawler/parsers/selectors/regex.py | 40 +++---- fastcrawler/parsers/selectors/xpath.py | 17 +-- fastcrawler/parsers/utils.py | 2 +- fastcrawler/schedule/__init__.py | 6 + fastcrawler/schedule/adopter.py | 8 +- .../schedule/{proto.py => contracts.py} | 2 + fastcrawler/schedule/schema.py | 55 ++++++++- fastcrawler/schedule/utilties.py | 53 --------- fastcrawler/utils/__init__.py | 2 +- fastcrawler/utils/injection.py | 32 ++--- pyproject.toml | 16 +-- requirements-dev.txt | 4 +- requirements/production.txt | 1 - setup.cfg | 17 +-- test/.coveragerc | 5 + test/conftest.py | 4 +- test/pytest.ini | 2 +- test/shared/engine.py | 8 +- test/shared/fastapi/uvicorn.log.yaml | 2 +- test/shared/mock_html.py | 1 + test/shared/mock_json.py | 20 +--- test/shared/schema.py | 10 +- test/test_engine.py | 74 ++++++++---- test/test_parser.py | 6 +- test/test_registery.py | 21 ++-- test/test_schedule.py | 34 +++--- tox.ini | 2 +- 51 files changed, 624 insertions(+), 343 deletions(-) delete mode 100644 fastcrawler/engine/constants.py rename fastcrawler/engine/{base.py => contracts.py} (74%) create mode 100644 fastcrawler/engine/playwright.py rename fastcrawler/parsers/{base.py => contracts.py} (100%) rename fastcrawler/parsers/processors/{base.py => contracts.py} (78%) rename fastcrawler/parsers/{pydantic.py => schema.py} (100%) create mode 100644 fastcrawler/parsers/selectors/contracts.py rename fastcrawler/schedule/{proto.py => contracts.py} (98%) delete mode 100644 fastcrawler/schedule/utilties.py create mode 100644 test/.coveragerc diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4d3be66..1d8ff31 
100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: - id: check-yaml - id: check-added-large-files - repo: 'https://github.com/psf/black' - rev: 23.3.0 + rev: 23.7.0 hooks: - id: black - repo: 'https://github.com/PyCQA/flake8' @@ -18,3 +18,8 @@ repos: rev: v1.4.1 hooks: - id: mypy + name: mypy (fastcrawler) + files: ^fastcrawler/ + # - id: mypy + # name: mypy (test) + # files: ^test/ diff --git a/README.md b/README.md index 9031c60..b5911cd 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# fastcrawler \ No newline at end of file +# fastcrawler diff --git a/docs_src/initilizing_project/sample1/main.py b/docs_src/initilizing_project/sample1/main.py index c042f65..5afdf88 100644 --- a/docs_src/initilizing_project/sample1/main.py +++ b/docs_src/initilizing_project/sample1/main.py @@ -2,5 +2,5 @@ from fastcrawler import FastCrawler app = FastCrawler( - crawlers=wiki_spider + crawlers=wiki_spider, ) diff --git a/docs_src/initilizing_project/sample1/wikipedia.py b/docs_src/initilizing_project/sample1/wikipedia.py index 4ec118a..e67162a 100644 --- a/docs_src/initilizing_project/sample1/wikipedia.py +++ b/docs_src/initilizing_project/sample1/wikipedia.py @@ -2,7 +2,7 @@ from fastcrawler import BaseModel, Crawler, CSSField, Spider, XPATHField -from fastcrawler.engine import AioHTTP +from fastcrawler.engine import AioHttpEngine class PageResolver(BaseModel): @@ -16,21 +16,24 @@ class ArticleData(BaseModel): class WikiBaseSpider(Spider): - engine = AioHTTP + engine = AioHttpEngine concurrency = 100 class WikiArticleFinder(WikiBaseSpider): data_model = PageResolver req_count = 1_000_000 - start_url = ["https://meta.wikimedia.org/wiki/List_of_Wikipedias", ] + start_url = [ + "https://meta.wikimedia.org/wiki/List_of_Wikipedias", + ] class WikiArticleRetirever(WikiBaseSpider): data_model = ArticleData req_count = 1_000_000 - async def save_data(self, data: ArticleData): ... # save parsed data to database + async def save_data(self, data: ArticleData): + ... # save parsed data to database wiki_spider = Crawler(WikiArticleFinder >> WikiArticleRetirever) diff --git a/fastcrawler/__init__.py b/fastcrawler/__init__.py index 2c1b92b..2cad067 100644 --- a/fastcrawler/__init__.py +++ b/fastcrawler/__init__.py @@ -1,5 +1,7 @@ from .core import Crawler, FastCrawler, Spider -from .parsers import BaseModel, CSSField, XPATHField, RegexField +from .engine import AioHttpEngine +from .parsers import BaseModel, CSSField, RegexField, XPATHField +from .schedule import RocketryApplication, RocketryController from .utils import Depends __all__ = [ @@ -10,5 +12,8 @@ "Depends", "Spider", "Crawler", - "FastCrawler" + "FastCrawler", + "RocketryApplication", + "RocketryController", + "AioHttpEngine", ] diff --git a/fastcrawler/core/__init__.py b/fastcrawler/core/__init__.py index 74ba334..c23dcfb 100644 --- a/fastcrawler/core/__init__.py +++ b/fastcrawler/core/__init__.py @@ -5,5 +5,5 @@ __all__ = [ "Crawler", "Spider", - "FastCrawler" + "FastCrawler", ] diff --git a/fastcrawler/core/registery.py b/fastcrawler/core/registery.py index e2438b6..bb63c93 100644 --- a/fastcrawler/core/registery.py +++ b/fastcrawler/core/registery.py @@ -9,6 +9,7 @@ class CrawlerMeta(type): DONT TOUCH THIS CLASS UNLESS YOU KNOW WHAT YOU ARE DOING. 
""" + def __init__(cls, name, bases, dct): super().__init__(name, bases, dct) cls._instances = {} diff --git a/fastcrawler/core/spider.py b/fastcrawler/core/spider.py index 81b197a..290c055 100644 --- a/fastcrawler/core/spider.py +++ b/fastcrawler/core/spider.py @@ -1,11 +1,25 @@ from typing import List -class SpiderMetaClass(type): +class Spider: + """ + Spider class to create the actual spider interface + so that configuration of each spider can be given + as class properties from the inheritanced class from spider + + instances property hold the instances that were set by metaclass + that is connected to current spider class + """ + + instances: List["Spider"] + + def __init__(self): + ... + def __rshift__(self, other: "Spider") -> "Spider": """ leveraged RSHIFT method for magic in flow >> - objA >> objB >> objC >> objD + clsA >> clsB >> clsC >> clsD Must be used as metaclass to inject behaviour to subclass @@ -17,15 +31,3 @@ def __rshift__(self, other: "Spider") -> "Spider": self.instances.append(other) setattr(other, "instances", self.instances) return other - - -class Spider(metaclass=SpiderMetaClass): - """ - Spider class to create the actual spider interface - so that configuration of each spider can be given - as class properties from the inheritanced class from spider - - instances property hold the instances that were set by metaclass - that is connected to current spider class - """ - instances: List["Spider"] diff --git a/fastcrawler/engine/__init__.py b/fastcrawler/engine/__init__.py index 5648a3c..259f4d1 100644 --- a/fastcrawler/engine/__init__.py +++ b/fastcrawler/engine/__init__.py @@ -1,8 +1,8 @@ -from .aio import AioHTTP -from .base import ProxySetting, SetCookieParam +from .aio import AioHttpEngine +from .contracts import ProxySetting, SetCookieParam __all__ = [ "ProxySetting", "SetCookieParam", - "AioHTTP", + "AioHttpEngine", ] diff --git a/fastcrawler/engine/aio.py b/fastcrawler/engine/aio.py index be28a82..458140c 100644 --- a/fastcrawler/engine/aio.py +++ b/fastcrawler/engine/aio.py @@ -1,13 +1,15 @@ import asyncio +from typing import Any import pydantic from aiohttp import BasicAuth, ClientSession, TCPConnector +from aiohttp.client import ClientResponse from aiohttp.cookiejar import Morsel -from fastcrawler.engine.base import ProxySetting, SetCookieParam +from fastcrawler.engine.contracts import ProxySetting, Response, SetCookieParam -class AioHTTP: +class AioHttpEngine: def __init__( self, cookies: list[SetCookieParam] | None = None, @@ -17,9 +19,9 @@ def __init__( connection_limit: int = 100, ): """Initialize a new engine instance with given cookie, header, useragent, and proxy""" - self.session = None + self.session: None | ClientSession = None self._cookies = ( - [(cookie.name, self.get_morsel_cookie(cookie)) for cookie in cookies] + [(cookie.name, self._get_morsel_cookie(cookie)) for cookie in cookies] if cookies is not None else None ) @@ -30,29 +32,39 @@ def __init__( self._connector = TCPConnector(limit_per_host=connection_limit) - self._proxy = {} + self._proxy: dict[Any, Any] = {} + self.proxy_dct = proxy if proxy: proxy_url = f"{proxy.protocol}{proxy.server}:{proxy.port}" self._proxy["proxy"] = proxy_url if proxy.username and proxy.password: - auth = BasicAuth(login=proxy.username, password=proxy.password) - self._proxy["proxy_auth"] = auth + self._proxy["proxy_auth"] = BasicAuth( + login=proxy.username, password=proxy.password + ) @property - def cookies(self): - return self._cookies + def cookies(self) -> list[SetCookieParam] | None: + """Return 
cookies""" + cookies = None + if self._cookies is not None: + cookies = [self._get_cookie(cookie) for _, cookie in self._cookies] + + return cookies @property - def headers(self): + def headers(self) -> dict: + """Return headers""" return self._headers @property - def proxy(self): - return self._proxy + def proxy(self) -> ProxySetting | None: + """Return proxy setting""" + return self.proxy_dct - def get_morsel_cookie(self, cookie: SetCookieParam) -> Morsel: + @staticmethod + def _get_morsel_cookie(cookie: SetCookieParam) -> Morsel: """Converts a SetCookieParam object to an Morsel object.""" - morsel_obj = Morsel() + morsel_obj: Morsel = Morsel() morsel_obj.set(cookie.name, cookie.value, cookie.value) morsel_obj.update( dict( @@ -66,6 +78,21 @@ def get_morsel_cookie(self, cookie: SetCookieParam) -> Morsel: ) return morsel_obj + @staticmethod + def _get_cookie(cookie: Morsel) -> SetCookieParam: + """convert Morsel object to SetCookieParam object""" + cookie_params = { + "name": cookie.key, + "value": cookie.value, + "domain": cookie.get("domain"), + "path": cookie.get("path"), + "expires": cookie.get("expires"), + "httpOnly": cookie.get("httponly"), + "secure": cookie.get("secure"), + "sameSite": cookie.get("samesite"), + } + return SetCookieParam(**cookie_params) + async def __aenter__(self): """Async context manager support for engine -> ENTER""" await self.setup() @@ -79,7 +106,7 @@ async def setup(self, **kwargs) -> None: """Set-up up the engine for crawling purpose.""" self.session = ClientSession( connector=self._connector, - cookies=self.cookies, + cookies=self._cookies, headers=self.headers, trust_env=True, **kwargs, @@ -87,38 +114,51 @@ async def setup(self, **kwargs) -> None: async def teardown(self) -> None: """Cleans up the engine.""" - await self.session.close() + if self.session: + await self.session.close() - async def base(self, url: pydantic.AnyUrl, method: str, data: dict, **kwargs) -> str: + async def base( + self, url: pydantic.AnyUrl, method: str, data: dict | None, **kwargs + ) -> Response | None: """Base Method for protocol to retrieve a list of URL.""" - - async with self.session.request( - method, url, data=data, headers=self.headers, **self.proxy, **kwargs - ) as response: - return await response.text() - - async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[str] | str: + if self.session: + async with self.session.request( + method, str(url), data=data, headers=self.headers, **self._proxy, **kwargs + ) as response: + return await self.translate_to_response(response) + return None + + async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[Response]: """GET HTTP Method for protocol to retrieve a list of URL.""" tasks = [self.base(url, "GET", None, **kwargs) for url in urls] return await asyncio.gather(*tasks) async def post( self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs - ) -> list[str] | str: + ) -> list[Response]: """POST HTTP Method for protocol to crawl a list of URL.""" tasks = [self.base(url, "POST", data=data, **kwargs) for url, data in zip(urls, datas)] return await asyncio.gather(*tasks) async def put( self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs - ) -> list[str] | str: + ) -> list[Response]: """PUT HTTP Method for protocol to crawl a list of URL.""" - tasks = [self.base(url, "PUT", data=data) for url, data in zip(urls, datas)] + tasks = [self.base(url, "PUT", data=data, **kwargs) for url, data in zip(urls, datas)] return await asyncio.gather(*tasks) async def delete( self, urls: 
list[pydantic.AnyUrl], datas: list[dict], **kwargs - ) -> list[str] | str: + ) -> list[Response]: """DELETE HTTP Method for protocol to crawl a list of URL.""" tasks = [self.base(url, "DELETE", data=data, **kwargs) for url, data in zip(urls, datas)] return await asyncio.gather(*tasks) + + async def translate_to_response(self, response_obj: ClientResponse) -> Response: + """Translate aiohttp response object to Response object""" + return Response( + text=await response_obj.text(), + status_code=response_obj.status, + headers=response_obj.headers, + cookie=response_obj.cookies, + ) diff --git a/fastcrawler/engine/constants.py b/fastcrawler/engine/constants.py deleted file mode 100644 index e69de29..0000000 diff --git a/fastcrawler/engine/base.py b/fastcrawler/engine/contracts.py similarity index 74% rename from fastcrawler/engine/base.py rename to fastcrawler/engine/contracts.py index 0b198bc..5f24c4b 100644 --- a/fastcrawler/engine/base.py +++ b/fastcrawler/engine/contracts.py @@ -7,15 +7,15 @@ class SetCookieParam(pydantic.BaseModel): - name: str - value: str + name: str = "" + value: str = "" url: str | None = None domain: str = "" - path: str | None = None - expires: float | None = None - httpOnly: bool | None = None - secure: bool | None = None - sameSite: Literal["Lax", "None", "Strict"] | None = None + path: str = "" + expires: str = "" + httpOnly: str = "" + secure: str = "" + sameSite: str | Literal["Lax", "None", "Strict"] = "" class ProxySetting(pydantic.BaseModel): @@ -26,6 +26,13 @@ class ProxySetting(pydantic.BaseModel): password: str | None = None +class Response(pydantic.BaseModel): + text: str | None = None + status_code: int | None = None + headers: dict | None = None + cookie: dict | None = None + + class EngineProto(Protocol): def __init__( self, @@ -34,7 +41,7 @@ def __init__( useragent: str | None, proxy: ProxySetting | None, ): - """Initialize a new engine instance with given cookie(s), header(s), useragent, and proxy""" + "Initialize a new engine instance with given cookie(s), header(s), useragent, and proxy" async def __aenter__(self): """Async context manager support for engine -> ENTER""" @@ -62,3 +69,6 @@ async def put(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str: async def delete(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str: """DELETE HTTP Method for protocol to crawl a list of URL.""" + + async def translate_to_response(self, response_obj: type) -> Response: + """Translate the response object to a Response object""" diff --git a/fastcrawler/engine/playwright.py b/fastcrawler/engine/playwright.py new file mode 100644 index 0000000..cf0e803 --- /dev/null +++ b/fastcrawler/engine/playwright.py @@ -0,0 +1,112 @@ +# pragma: no cover +# noqa + +import pydantic + +from fastcrawler.engine.contracts import ProxySetting, SetCookieParam + + +class PlayWrightEngine: + def __init__( + self, + cookies: list[SetCookieParam] | None = None, + headers: dict | None = None, + useragent: str | None = None, + proxy: ProxySetting | None = None, + connection_limit: int = 100, + ): + """Initialize a new engine instance with given cookie, header, useragent, and proxy""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + @property + def cookies(self): + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + +
@property + def headers(self): + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + @property + def proxy(self): + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def __aenter__(self): + """Async context manager support for engine -> ENTER""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager support for engine -> EXIT""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def setup(self, **kwargs) -> None: + """Set up the engine for crawling purpose.""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def teardown(self) -> None: + """Cleans up the engine.""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def base(self, url: pydantic.AnyUrl, method: str, data: dict, **kwargs) -> str: + """Base Method for protocol to retrieve a list of URL.""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[str] | str: + """GET HTTP Method for protocol to retrieve a list of URL.""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def post( + self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs + ) -> list[str] | str: + """POST HTTP Method for protocol to crawl a list of URL.""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def put( + self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs + ) -> list[str] | str: + """PUT HTTP Method for protocol to crawl a list of URL.""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) + + async def delete( + self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs + ) -> list[str] | str: + """DELETE HTTP Method for protocol to crawl a list of URL.""" + raise NotImplementedError( + "Playwright engine is included as another lib, pip install fastcrawler[playwright]" + "\nfrom fastcrawler_playwright import PlayWrightEngine" + ) diff --git a/fastcrawler/parsers/__init__.py b/fastcrawler/parsers/__init__.py index ccf2e73..0861069 100644 --- a/fastcrawler/parsers/__init__.py +++ b/fastcrawler/parsers/__init__.py @@ -3,7 +3,7 @@ from .processors.lxml import LxmlProcessor # from .processors.modest import ModestProcessor -from .pydantic import BaseModel +from .schema import BaseModel from
.selectors.css import CSSField from .selectors.regex import RegexField from .selectors.xpath import XPATHField @@ -18,6 +18,5 @@ "JsonParser", "HTMLParser", # Processors - "ModestProcessor", "LxmlProcessor", ] diff --git a/fastcrawler/parsers/base.py b/fastcrawler/parsers/contracts.py similarity index 100% rename from fastcrawler/parsers/base.py rename to fastcrawler/parsers/contracts.py diff --git a/fastcrawler/parsers/html.py b/fastcrawler/parsers/html.py index eedc3d0..16af037 100644 --- a/fastcrawler/parsers/html.py +++ b/fastcrawler/parsers/html.py @@ -5,7 +5,7 @@ from fastcrawler.exceptions import ParserInvalidModelType, ParserValidationError -from .pydantic import BaseModel, BaseModelType, URLs +from .schema import BaseModel, BaseModelType, URLs from .selectors.base import BaseSelector, get_selector from .utils import get_inner_model @@ -26,6 +26,14 @@ class HTMLParser: html_parser.parse(a pydantic model built with XPATHField or CSSField) """ + @property + def data(self): + return getattr(self, "_data", None) + + @data.setter + def data(self, value): + self._data = value + def __init__(self, scraped_data: str): """ Initiate the HTML file in memory, so it can be parsed later @@ -33,13 +41,12 @@ def __init__(self, scraped_data: str): """ self.scraped_data = scraped_data self.resolver: URLs | None = None - self.data = None def parse(self, model: Type[BaseModelType]) -> BaseModelType: """ Parse using the pydantic model """ - if issubclass(model, BaseModel): # type: ignore + if issubclass(model, BaseModel): data = {} for field_name, field in model.model_fields.items(): fastcrawler_selector = get_selector(field) @@ -55,7 +62,7 @@ def parse(self, model: Type[BaseModelType]) -> BaseModelType: model.Config, "url_resolver", ) and issubclass(model.Config.url_resolver.__class__, BaseSelector): - urls: list[Url] = model.Config.url_resolver.resolve( # type: ignore + urls: list[Url] = model.Config.url_resolver.resolve( self.scraped_data, model=None, ) diff --git a/fastcrawler/parsers/json.py b/fastcrawler/parsers/json.py index db1a957..812a74e 100644 --- a/fastcrawler/parsers/json.py +++ b/fastcrawler/parsers/json.py @@ -4,7 +4,7 @@ from pydantic_core import Url from fastcrawler.exceptions import ParserInvalidModelType, ParserValidationError -from fastcrawler.parsers.pydantic import BaseModel, BaseModelType, URLs +from fastcrawler.parsers.schema import BaseModel, BaseModelType, URLs class JsonParser: @@ -23,7 +23,13 @@ class JsonParser: html_parser.parse(a pydantic model built with XPATHField or CSSField) """ - data = None + @property + def data(self): + return getattr(self, "_data", None) + + @data.setter + def data(self, value): + self._data = value def __init__(self, scraped_data: dict): """ @@ -37,19 +43,23 @@ def parse(self, model: Type[BaseModelType]) -> BaseModelType: """ Parse using the pydantic model """ - if hasattr(model, "__mro__") and BaseModel in model.__mro__: # type: ignore + if hasattr(model, "__mro__") and BaseModel in model.__mro__: self.data: BaseModelType | Any = {} for field_name, field in model.model_fields.items(): self.data[field_name] = self.scraped_data.get(field_name) or field.default - if hasattr(model.Config, "url_resolver") and isinstance(model.Config.url_resolver, str): - current_address: dict = self.scraped_data.copy() - for address in model.Config.url_resolver.split("."): - current_address = current_address.get(address) # type: ignore + if hasattr(model.Config, "url_resolver") and isinstance( + model.Config.url_resolver, str + ): + current_address: Any | dict = 
self.scraped_data.copy() + for adrs in model.Config.url_resolver.split("."): + # Keep looping, w.t.r dots, (like key.key) to get page value + current_address = current_address.get(adrs) + self.resolver = URLs( urls=[ - Url(current_address), + Url(current_address), # type: ignore ] ) try: diff --git a/fastcrawler/parsers/processors/base.py b/fastcrawler/parsers/processors/contracts.py similarity index 78% rename from fastcrawler/parsers/processors/base.py rename to fastcrawler/parsers/processors/contracts.py index 2fdd659..cd09f6e 100644 --- a/fastcrawler/parsers/processors/base.py +++ b/fastcrawler/parsers/processors/contracts.py @@ -1,7 +1,10 @@ +# pragma: no cover +# noqa + from typing import Protocol -class ElementInterface(Protocol): +class ElementProtocol(Protocol): def get(self, key: str, default=None): """ get method, which resolves an HTML element from a given key @@ -23,11 +26,11 @@ def text(self): """ -class ProcessorInterface(Protocol): - base_element: ElementInterface = ... +class ProcessorProcotol(Protocol): + base_element: ElementProtocol @staticmethod - def to_string(result: ElementInterface) -> str: + def to_string(result: ElementProtocol) -> str: """ Resolves a result to string, by getting the inner html, This method is used to iterate over HTML elements to resolve inner pydantic models @@ -37,7 +40,7 @@ def to_string(result: ElementInterface) -> str: def from_string_by_xpath( string: str, query: str, - ) -> list[ElementInterface] | ElementInterface | None: + ) -> list[ElementProtocol] | ElementProtocol | None: """ Resolves a HTML string by XPATH """ @@ -46,7 +49,7 @@ def from_string_by_xpath( def from_string_by_css( string: str, query: str, - ) -> list[ElementInterface] | ElementInterface | None: + ) -> list[ElementProtocol] | ElementProtocol | None: """ Resolves a HTML string by CSS """ diff --git a/fastcrawler/parsers/processors/lxml.py b/fastcrawler/parsers/processors/lxml.py index 29079cd..9ac0518 100644 --- a/fastcrawler/parsers/processors/lxml.py +++ b/fastcrawler/parsers/processors/lxml.py @@ -13,7 +13,7 @@ def to_string(result: etree.ElementBase) -> str: Resolves a result to string, by getting the inner html, This method is used to iterate over HTML elements to resolve inner pydantic models """ - return lxml_html.tostring(result) + return lxml_html.tostring(result) # type: ignore @staticmethod def from_string_by_xpath( @@ -24,7 +24,7 @@ def from_string_by_xpath( Resolves a HTML string by XPATH """ tree = lxml_html.fromstring(string) - results: list[etree.ElementBase] = tree.xpath(query) + results: list[etree.ElementBase] = tree.xpath(query) # type: ignore return results @staticmethod @@ -36,5 +36,5 @@ def from_string_by_css( Resolves a HTML string by CSS """ tree = lxml_html.fromstring(string) - results: list[etree.ElementBase] = tree.cssselect(query) + results: list[etree.ElementBase] = tree.cssselect(query) # type: ignore return results diff --git a/fastcrawler/parsers/processors/modest.py b/fastcrawler/parsers/processors/modest.py index bd283fe..4b11e96 100644 --- a/fastcrawler/parsers/processors/modest.py +++ b/fastcrawler/parsers/processors/modest.py @@ -1,37 +1,34 @@ # NOTE: This library is not matured yet to be used -# from selectolax.parser import HTMLParser, Node - -# from .base import ElementInterface - - -# class ModestProcessor: -# base_element = Node - -# @staticmethod -# def to_string(result: Node) -> str: -# """ -# Resolves a result to string, by getting the inner html, -# This method is used to iterate over HTML elements to resolve inner pydantic 
models -# """ -# return result.html - -# @staticmethod -# def from_string_by_xpath( -# string: str, query: str -# ) -> list[ElementInterface] | ElementInterface | None: -# """ -# Resolves a HTML string by XPATH -# """ -# raise NotImplementedError("XPATH is not supported in selectolax") - -# @staticmethod -# def from_string_by_css( -# string: str, query: str -# ) -> list[ElementInterface] | ElementInterface | None: -# """ -# Resolves a HTML string by CSS -# """ -# results = HTMLParser(string).css(query) -# return results +class ModestProcessor: + @staticmethod + def to_string(result) -> str: + """ + Resolves a result to string, by getting the inner html, + This method is used to iterate over HTML elements to resolve inner pydantic models + """ + raise NotImplementedError( + "Selectolax processors is incldued as another lib, pip install fastcrawler[selecolax]" + "\nfrom fastcrawler_selectolax import ModestProcessor" + ) + + @staticmethod + def from_string_by_xpath(string: str, query: str): + """ + Resolves a HTML string by XPATH + """ + raise NotImplementedError( + "Selectolax processors is incldued as another lib, pip install fastcrawler[selecolax]" + "\nfrom fastcrawler_selectolax import ModestProcessor" + ) + + @staticmethod + def from_string_by_css(string: str, query: str): + """ + Resolves a HTML string by CSS + """ + raise NotImplementedError( + "Selectolax processors is incldued as another lib, pip install fastcrawler[selecolax]" + "\nfrom fastcrawler_selectolax import ModestProcessor" + ) diff --git a/fastcrawler/parsers/pydantic.py b/fastcrawler/parsers/schema.py similarity index 100% rename from fastcrawler/parsers/pydantic.py rename to fastcrawler/parsers/schema.py diff --git a/fastcrawler/parsers/selectors/base.py b/fastcrawler/parsers/selectors/base.py index 29e478c..d2ec86f 100644 --- a/fastcrawler/parsers/selectors/base.py +++ b/fastcrawler/parsers/selectors/base.py @@ -1,15 +1,13 @@ -# pylint: disable=c-extension-no-member - from typing import Any, Callable from pydantic.fields import FieldInfo from fastcrawler.exceptions import ProcessorNotSupported -from fastcrawler.parsers.base import ParserProtocol -from fastcrawler.parsers.pydantic import BaseModelType, MappedAttr, MappedResult +from fastcrawler.parsers.contracts import ParserProtocol +from fastcrawler.parsers.schema import BaseModelType, MappedAttr, MappedResult from fastcrawler.parsers.utils import _UNSET -from ..processors.base import ElementInterface, ProcessorInterface +from ..processors.contracts import ElementProtocol, ProcessorProcotol from ..processors.lxml import LxmlProcessor @@ -20,10 +18,10 @@ def __init__( self, query: str, parser: Callable[..., ParserProtocol] | None = None, - processor: ProcessorInterface | None = None, + processor: ProcessorProcotol | None = None, extract: str | None = None, many: bool = False, - model: Callable[..., BaseModelType] | None = None, + model: BaseModelType | list[BaseModelType | Any] | None = None, default: Any = _UNSET, ): self.query = query @@ -36,20 +34,24 @@ def __init__( def __repr__(self): """Represents a selector for debugging purposes""" - return f"Field(type={self.__class__.__name__} extract={self.extract}, many={self.many}, query={self.query})" + return ( + f"Field(type={self.__class__.__name__} extract={self.extract}," + f" many={self.many}, query={self.query})" + ) def resolve(self, scraped_data, model): """Must be implemented by outer classes. 
Resolves the selector spefinalized by 'XPATH' or 'CSS' or etc """ raise NotImplementedError( - "Resolves must be overwritten by subclass" f"scraped_data={scraped_data}, model={model}" + "Resolves must be overwritten by subclass" + f"scraped_data={scraped_data}, model={model}" ) def _process_results( self, - results: list[ElementInterface], - ) -> BaseModelType | list[BaseModelType | Any] | None: + results: list[ElementProtocol], + ) -> BaseModelType | list[BaseModelType | Any] | list[ElementProtocol] | None: """Process the results resolved based on the logic which is combination of many, and extract. """ @@ -57,7 +59,10 @@ def _process_results( if self.many: results = [(self.get_from_exctract(result)) for result in results] if self.model: - results = [self.parser(self.processor.to_string(el)).parse(self.model) for el in results] + results = [ + self.parser(self.processor.to_string(el)).parse(self.model) # type: ignore + for el in results # type: ignore + ] return results results = self.get_from_exctract(results[0]) @@ -94,7 +99,7 @@ def call_from_mapper(self, result, mapped: MappedAttr, *args, **kwargs): else getattr(result, mapped.attr_name) ) - def get_from_exctract(self, result: ElementInterface) -> Any: + def get_from_exctract(self, result: ElementProtocol) -> Any: """ Resolve the extract from string, to get text from etree.ElementBase or to get other attributes or the string of HTML by default @@ -113,9 +118,13 @@ def get_from_exctract(self, result: ElementInterface) -> Any: self.interface_mapper(result).get, self.extract, ) - elif not self.extract and not self.many and issubclass(type(result), self.processor.base_element): + elif ( + not self.extract + and not self.many + and issubclass(type(result), self.processor.base_element) # type: ignore + ): # Return: HTML string of object result - return self.processor.to_string(result) + return self.processor.to_string(result) # type: ignore else: # Return: inner HTML element objects to parse nested models return result diff --git a/fastcrawler/parsers/selectors/contracts.py b/fastcrawler/parsers/selectors/contracts.py new file mode 100644 index 0000000..76300f7 --- /dev/null +++ b/fastcrawler/parsers/selectors/contracts.py @@ -0,0 +1,33 @@ +# pragma: no cover +# pylint: disable=c-extension-no-member +from typing import Any, Callable, Protocol + +from fastcrawler.parsers.contracts import ParserProtocol +from fastcrawler.parsers.schema import BaseModelType +from fastcrawler.parsers.utils import _UNSET + +from ..processors.contracts import ProcessorProcotol + + +class SelectorProto(Protocol): + """Base class for HTML-based selectors that are dependent on lxml family.""" + + def __init__( + self, + query: str, + parser: Callable[..., ParserProtocol] | None = None, + processor: ProcessorProcotol | None = None, + extract: str | None = None, + many: bool = False, + model: Callable[..., BaseModelType] | None = None, + default: Any = _UNSET, + ): + """Initiate selector""" + + def __repr__(self): + """Represents a selector for debugging purposes""" + + def resolve(self, scraped_data, model): + """Must be implemented by outer classes. 
+ Resolves the selector spefinalized by 'XPATH' or 'CSS' or etc + """ diff --git a/fastcrawler/parsers/selectors/css.py b/fastcrawler/parsers/selectors/css.py index ac6fba1..bdc20cf 100644 --- a/fastcrawler/parsers/selectors/css.py +++ b/fastcrawler/parsers/selectors/css.py @@ -1,12 +1,12 @@ # pylint: disable=c-extension-no-member -from typing import Any, Callable +from typing import Any from fastcrawler.parsers.html import HTMLParser -from fastcrawler.parsers.pydantic import BaseModelType +from fastcrawler.parsers.schema import BaseModelType from fastcrawler.parsers.utils import _UNSET -from ..processors.base import ProcessorInterface +from ..processors.contracts import ProcessorProcotol from .base import BaseSelector @@ -17,23 +17,24 @@ class _CSSField(BaseSelector): """ def resolve( - self, scraped_data: str, model: None | BaseModelType = None + self, scraped_data: str, model: BaseModelType | list[BaseModelType | Any] | None ) -> BaseModelType | list[BaseModelType | Any] | None: """Resolves HTML input using CSS selector""" self.model = model or self.model results = self.processor.from_string_by_css(scraped_data, self.query) if not results: return self.default - return self._process_results(results) + return self._process_results(results) # type: ignore +# pylint: disable=invalid-name def CSSField( query: str, - processor: None | ProcessorInterface = None, - parser: HTMLParser = HTMLParser, + processor: None | ProcessorProcotol = None, + parser=HTMLParser, extract: str | None = None, many: bool = False, - model: Callable[..., BaseModelType] | None = None, + model: BaseModelType | list[BaseModelType | Any] | None = None, default: Any = _UNSET, ) -> Any: """The reason that an object was initiated from class, and the class wasn't called directly diff --git a/fastcrawler/parsers/selectors/regex.py b/fastcrawler/parsers/selectors/regex.py index 1994f91..8395eed 100644 --- a/fastcrawler/parsers/selectors/regex.py +++ b/fastcrawler/parsers/selectors/regex.py @@ -1,11 +1,11 @@ -# pylint: disable=c-extension-no-member import re -from typing import Any, Callable, Literal +from typing import Any from fastcrawler.parsers.html import HTMLParser -from fastcrawler.parsers.pydantic import BaseModelType +from fastcrawler.parsers.schema import BaseModelType from fastcrawler.parsers.utils import _UNSET +from ..processors.contracts import ProcessorProcotol from .base import BaseSelector @@ -15,36 +15,25 @@ class _RegexField(BaseSelector): document using Regex. 
""" - def __init__( - self, - regex: Literal[""], - default: Any = _UNSET, - many: bool = False, - model: Callable[..., BaseModelType] | None = None, - ): - self.parser = HTMLParser - self.default = default - self.regex = re.compile(regex) - self.many = many - self.model = model - def resolve( - self, scraped_data: str, model: BaseModelType | None = None + self, scraped_data: str, model: BaseModelType | list[BaseModelType | Any] | None = None ) -> BaseModelType | list[BaseModelType | Any] | None | Any: """Resolves HTML input as the Regex value given to list""" self.model = model or self.model if self.many: - return re.findall(self.regex, scraped_data) + return re.findall(self.query, scraped_data) else: - result = re.search(self.regex, scraped_data) + result = re.search(self.query, scraped_data) return result.group(1) if result else None # pylint: disable=invalid-name def RegexField( - regex: Literal[r""], + query: str, + processor: None | ProcessorProcotol = None, + parser=HTMLParser, many: bool = False, - model: Callable[..., BaseModelType] | None = None, + model: BaseModelType | list[BaseModelType | Any] | None = None, default: Any = _UNSET, ) -> Any: """The reason that an object was initiated from class, and the class wasn't called directly @@ -52,4 +41,11 @@ def RegexField( and that's not what we want, we want to assign this to another type (ANY), so I should be using a function as interface to avoid IDE's error in type annotation or mypy. """ - return _RegexField(regex=regex, many=many, default=default, model=model) + return _RegexField( + query=query, + many=many, + model=model, + default=default, + parser=parser, + processor=processor, + ) diff --git a/fastcrawler/parsers/selectors/xpath.py b/fastcrawler/parsers/selectors/xpath.py index e40c491..9f931f4 100644 --- a/fastcrawler/parsers/selectors/xpath.py +++ b/fastcrawler/parsers/selectors/xpath.py @@ -1,12 +1,12 @@ # pylint: disable=c-extension-no-member -from typing import Any, Callable +from typing import Any from fastcrawler.parsers.html import HTMLParser -from fastcrawler.parsers.pydantic import BaseModelType +from fastcrawler.parsers.schema import BaseModelType from fastcrawler.parsers.utils import _UNSET -from ..processors.base import ProcessorInterface +from ..processors.contracts import ProcessorProcotol from .base import BaseSelector @@ -17,23 +17,24 @@ class _XPATHField(BaseSelector): """ def resolve( - self, scraped_data: str, model: BaseModelType | None = None + self, scraped_data: str, model: BaseModelType | list[BaseModelType | Any] | None ) -> BaseModelType | list[BaseModelType | Any] | None: """Resolves HTML input as the xpath value given to list""" self.model = model or self.model results = self.processor.from_string_by_xpath(scraped_data, self.query) if not results: return self.default - return self._process_results(results) + return self._process_results(results) # type: ignore +# pylint: disable=invalid-name def XPATHField( query: str, - processor: None | ProcessorInterface = None, - parser: HTMLParser = HTMLParser, + processor: None | ProcessorProcotol = None, + parser=HTMLParser, extract: str | None = None, many: bool = False, - model: Callable[..., BaseModelType] | None = None, + model: BaseModelType | list[BaseModelType | Any] | None = None, default: Any = _UNSET, ) -> Any: """The reason that an object was initiated from class, and the class wasn't called directly diff --git a/fastcrawler/parsers/utils.py b/fastcrawler/parsers/utils.py index 876dc8b..5be7c3b 100644 --- a/fastcrawler/parsers/utils.py +++ 
b/fastcrawler/parsers/utils.py @@ -1,6 +1,6 @@ from typing import Any, get_args -from .pydantic import BaseModelType +from .schema import BaseModelType def get_inner_model(model: list[BaseModelType] | Any, field_name: str) -> Any | BaseModelType: diff --git a/fastcrawler/schedule/__init__.py b/fastcrawler/schedule/__init__.py index e69de29..bedc64a 100644 --- a/fastcrawler/schedule/__init__.py +++ b/fastcrawler/schedule/__init__.py @@ -0,0 +1,6 @@ +from .adopter import RocketryApplication, RocketryController + +__all__ = [ + "RocketryApplication", + "RocketryController", +] diff --git a/fastcrawler/schedule/adopter.py b/fastcrawler/schedule/adopter.py index 1aa3852..a9506de 100644 --- a/fastcrawler/schedule/adopter.py +++ b/fastcrawler/schedule/adopter.py @@ -1,7 +1,7 @@ from typing import Callable -from rocketry import Rocketry -from rocketry.conditions.api import cron +from rocketry import Rocketry # type: ignore +from rocketry.conditions.api import cron # type: ignore from fastcrawler.exceptions import TaskNotFound @@ -37,7 +37,7 @@ async def shut_down(self) -> None: return None -class RocketryManager: +class RocketryController: def __init__(self, app: RocketryApplication): self.app = app @@ -72,7 +72,7 @@ async def change_task_schedule( if schedule.count(" ") == 4: task.start_cond = cron(schedule) else: - task.start_cond = schedule + task.start_cond = schedule # type: ignore return None raise TaskNotFound(task_name) diff --git a/fastcrawler/schedule/proto.py b/fastcrawler/schedule/contracts.py similarity index 98% rename from fastcrawler/schedule/proto.py rename to fastcrawler/schedule/contracts.py index 227dec5..4697174 100644 --- a/fastcrawler/schedule/proto.py +++ b/fastcrawler/schedule/contracts.py @@ -1,3 +1,5 @@ +# pragma: no cover + from typing import Callable, Protocol from .schema import Task diff --git a/fastcrawler/schedule/schema.py b/fastcrawler/schedule/schema.py index cd25853..376da02 100644 --- a/fastcrawler/schedule/schema.py +++ b/fastcrawler/schedule/schema.py @@ -2,8 +2,59 @@ from typing import Literal from pydantic import BaseModel, Field # pylint: disable=no-name-in-module +from rocketry.core import BaseCondition as _BaseCondition # type: ignore -from .utilties import BaseCondition + +class BaseCondition(_BaseCondition): # pylint: disable=abstract-method + """A condition is a thing/occurence that should happen in + order to something happen. + + Conditions are used to determine whether a task can be started, + a task should be terminated or the scheduler should shut + down. Conditions are either true or false. + + A condition could answer for any of the following questions: + - Current time is as specified (ie. Monday afternoon). + - A given task has already run. + - The machine has at least a given amount of RAM. + - A specific file exists. + + Each condition should have the method ``__bool__`` specified + as minimum. This method should return ``True`` or ``False`` + depending on whether the condition holds or does not hold. + + Examples + -------- + + Minimum example: + + >>> from rocketry.core import BaseCondition + >>> class MyCondition(BaseCondition): + ... def __bool__(self): + ... ... # Code that defines state either + ... return True + + Complicated example with parser: + + >>> import os, re + >>> class IsFooBar(BaseCondition): + ... __parsers__ = { + ... re.compile(r"is foo '(?P.+)'"): "__init__" + ... } + ... + ... def __init__(self, outcome): + ... self.outcome = outcome + ... + ... def __bool__(self): + ... return self.outcome == "bar" + ... + ... 
def __repr__(self): + ... return f"IsFooBar('{self.outcome}')" + ... + >>> from rocketry.parse import parse_condition + >>> parse_condition("is foo 'bar'") + IsFooBar('bar') + """ class Task(BaseModel): @@ -74,7 +125,7 @@ class Task(BaseModel): name: str | None = Field(description="Name of the task. Must be unique") description: str | None = Field(description="Description of the task for documentation") logger_name: str | None = Field( - description="Logger name to be used in logging the task record" + description="Logger name to be used in logging the task record", default=None ) execution: Literal["main", "async", "thread", "process"] | None = None priority: int = 0 diff --git a/fastcrawler/schedule/utilties.py b/fastcrawler/schedule/utilties.py deleted file mode 100644 index 00a7bee..0000000 --- a/fastcrawler/schedule/utilties.py +++ /dev/null @@ -1,53 +0,0 @@ -from rocketry.core import BaseCondition as _BaseCondition - - -class BaseCondition(_BaseCondition): # pylint: disable=abstract-method - """A condition is a thing/occurence that should happen in - order to something happen. - - Conditions are used to determine whether a task can be started, - a task should be terminated or the scheduler should shut - down. Conditions are either true or false. - - A condition could answer for any of the following questions: - - Current time is as specified (ie. Monday afternoon). - - A given task has already run. - - The machine has at least a given amount of RAM. - - A specific file exists. - - Each condition should have the method ``__bool__`` specified - as minimum. This method should return ``True`` or ``False`` - depending on whether the condition holds or does not hold. - - Examples - -------- - - Minimum example: - - >>> from rocketry.core import BaseCondition - >>> class MyCondition(BaseCondition): - ... def __bool__(self): - ... ... # Code that defines state either - ... return True - - Complicated example with parser: - - >>> import os, re - >>> class IsFooBar(BaseCondition): - ... __parsers__ = { - ... re.compile(r"is foo '(?P.+)'"): "__init__" - ... } - ... - ... def __init__(self, outcome): - ... self.outcome = outcome - ... - ... def __bool__(self): - ... return self.outcome == "bar" - ... - ... def __repr__(self): - ... return f"IsFooBar('{self.outcome}')" - ... - >>> from rocketry.parse import parse_condition - >>> parse_condition("is foo 'bar'") - IsFooBar('bar') - """ diff --git a/fastcrawler/utils/__init__.py b/fastcrawler/utils/__init__.py index 8540fd3..848a947 100644 --- a/fastcrawler/utils/__init__.py +++ b/fastcrawler/utils/__init__.py @@ -2,5 +2,5 @@ __all__ = [ "dependency_injector", - "Depends" + "Depends", ] diff --git a/fastcrawler/utils/injection.py b/fastcrawler/utils/injection.py index 730f9ac..0a33c26 100644 --- a/fastcrawler/utils/injection.py +++ b/fastcrawler/utils/injection.py @@ -5,36 +5,36 @@ class _Depends: - """ Dependancy injection to run callable as a dependency - """ + """Dependancy injection to run callable as a dependency""" + def __init__( - self, dependency: Callable[..., Any], + self, + dependency: Callable[..., Any], *, - use_cache: bool = False + use_cache: bool = False, ): self.dependency = dependency self.use_cache = use_cache self.result = ... 
async def async_eval(self): - """Run async callable dependnecy and store it as cache entry - """ + """Run async callable dependnecy and store it as cache entry""" if self.result is ...: self.result = await self.dependency() return self.result def sync_eval(self): - """Run sync callable dependency and store it as cache entry - """ + """Run sync callable dependency and store it as cache entry""" if self.result is ...: self.result = self.dependency() return self.result def __repr__(self) -> str: - """ Represents the callable dependency - """ + """Represents the callable dependency""" attr = getattr( - self.dependency, "__name__", type(self.dependency).__name__ + self.dependency, + "__name__", + type(self.dependency).__name__, ) cache = "" if self.use_cache else ", use_cache=False" return f"{self.__class__.__name__}({attr}{cache})" @@ -46,6 +46,7 @@ def dependency_injector(func): works for both async and sync """ if asyncio.iscoroutinefunction(func): + @wraps(func) async def async_wrapper(*args, **kwargs): sig = inspect.signature(func) @@ -67,6 +68,7 @@ async def async_wrapper(*args, **kwargs): return async_wrapper else: + @wraps(func) def sync_wrapper(*args, **kwargs): sig = inspect.signature(func) @@ -91,11 +93,11 @@ def sync_wrapper(*args, **kwargs): def Depends( dependency: Callable[..., Any], *, - use_cache: bool = False + use_cache: bool = False, ) -> Any: """The reason that an object was initiated from class, and the class wasn't called directly - is that because class __init__ method is returning only the instance of that class, - and that's not what we want, we want to assign this to another type (ANY), so I should - be using a function as interface to avoid IDE's error in type annotation or mypy. + is that because class __init__ method is returning only the instance of that class, + and that's not what we want, we want to assign this to another type (ANY), so I should + be using a function as interface to avoid IDE's error in type annotation or mypy. 
""" return _Depends(dependency=dependency, use_cache=use_cache) diff --git a/pyproject.toml b/pyproject.toml index fa017c6..5cc965c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,15 @@ [build-system] -requires = ["setuptools", "wheel", "flit_core >=2,<3"] build-backend = "setuptools.build_meta" +requires = ["setuptools", "wheel", "flit_core >=2,<3"] [tool.black] -line-length = 120 -target-version = ['py39', 'py310'] +line-length = 99 +target-version = ['py311'] [tool.flit.metadata.requires-extra] all = [ - "typer >=0.7.0,<1.0.0", - "colorama >=0.4.3,<0.5.0", - "shellingham >=1.3.0,<2.0.0", - "rich >=10.11.0,<14.0.0", -] \ No newline at end of file + "typer >=0.7.0,<1.0.0", + "colorama >=0.4.3,<0.5.0", + "shellingham >=1.3.0,<2.0.0", + "rich >=10.11.0,<14.0.0", +] diff --git a/requirements-dev.txt b/requirements-dev.txt index e4b37de..e217000 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,4 +9,6 @@ aiohttp fastapi uvicorn colorama -pyyaml \ No newline at end of file +pyyaml +redbird @ git+https://github.com/ManiMozaffar/red-bird@v2_but_v1 +rocketry @ git+https://github.com/ManiMozaffar/rocketry@master diff --git a/requirements/production.txt b/requirements/production.txt index 56585c1..a3e81b8 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -1,2 +1 @@ -r base.txt - diff --git a/setup.cfg b/setup.cfg index 92c23e9..d7436d9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,7 +11,7 @@ classifiers = License :: OSI Approved :: MIT License [options] -python_requires = >=3.9 +python_requires = >=3.11 package_dir = =fastcrawler packages = find_namespace: @@ -33,18 +33,19 @@ show_error_context = True pretty = True namespace_packages = True check_untyped_defs = True +exclude=[ docs_src/* ] -[flake8] -max-line-length = 120 -[tool:pytest] -testpaths = test -addopts = --cov --strict-markers -xfail_strict = True +[flake8] +max-line-length = 99 [coverage:run] source = fastcrawler branch = True +omit = + */contracts.py + */modest.py + */playwright.py [coverage:report] fail_under = 9.0 @@ -57,7 +58,7 @@ source = */site-packages/fastcrawler [tox:tox] -envlist = py39,py310 +envlist = py311 isolated_build = True [testenv] diff --git a/test/.coveragerc b/test/.coveragerc new file mode 100644 index 0000000..9d09d6f --- /dev/null +++ b/test/.coveragerc @@ -0,0 +1,5 @@ +[run] +omit = + *contracts.py + *modest.py + *playwright.py diff --git a/test/conftest.py b/test/conftest.py index b8e7461..9f567c2 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -12,7 +12,7 @@ import pytest import pytest_asyncio -from fastcrawler.schedule.adopter import RocketryApplication, RocketryManager +from fastcrawler.schedule.adopter import RocketryApplication, RocketryController @pytest.fixture @@ -64,4 +64,4 @@ def task_app(): @pytest.fixture(scope="function") def task_manager(task_app): - yield RocketryManager(task_app) + yield RocketryController(task_app) diff --git a/test/pytest.ini b/test/pytest.ini index f1518e6..e570bfa 100644 --- a/test/pytest.ini +++ b/test/pytest.ini @@ -2,4 +2,4 @@ markers = asyncio: Tests that require asyncio -addopts = --cov=fastcrawler --cov-report term-missing --disable-warnings +addopts = -v --cov=fastcrawler --cov-report term-missing --disable-warnings diff --git a/test/shared/engine.py b/test/shared/engine.py index b12cfd1..7448177 100644 --- a/test/shared/engine.py +++ b/test/shared/engine.py @@ -4,7 +4,7 @@ from pydantic_settings import BaseSettings from fastcrawler.engine import ProxySetting, SetCookieParam -from 
fastcrawler.engine.aio import AioHTTP
+from fastcrawler.engine.aio import AioHttpEngine
 
 sample_cookies = [
     SetCookieParam(
@@ -68,6 +68,10 @@ def get_cookies():
 
 async def get_aiohttp_engine():
     headers = {}
-    engine = AioHTTP(cookies=get_cookies(), headers=headers, useragent=get_random_useragent())
+    engine = AioHttpEngine(
+        cookies=get_cookies(),
+        headers=headers,
+        useragent=get_random_useragent(),
+    )
     await engine.setup()
     return engine
diff --git a/test/shared/fastapi/uvicorn.log.yaml b/test/shared/fastapi/uvicorn.log.yaml
index 5a5028b..6dbfbc7 100644
--- a/test/shared/fastapi/uvicorn.log.yaml
+++ b/test/shared/fastapi/uvicorn.log.yaml
@@ -12,4 +12,4 @@ loggers:
   uvicorn:
     level: DEBUG
     handlers:
-      - console
\ No newline at end of file
+      - console
diff --git a/test/shared/mock_html.py b/test/shared/mock_html.py
index 9e7f59b..6d50a4b 100644
--- a/test/shared/mock_html.py
+++ b/test/shared/mock_html.py
@@ -1,5 +1,6 @@
 # pylint: skip-file
 
+
 def get_html():
     return """
diff --git a/test/shared/mock_json.py b/test/shared/mock_json.py
index 9864601..1b81c5f 100644
--- a/test/shared/mock_json.py
+++ b/test/shared/mock_json.py
@@ -1,24 +1,16 @@
 # pylint: skip-file
 
+
 def get_json_data():
     return {
         "results": [
-            {
-                "id": 1,
-                "name": "Link 1"
-            },
-            {
-                "id": 2,
-                "name": "Link 2"
-            },
-            {
-                "id": 3,
-                "name": "Link 3"
-            }
+            {"id": 1, "name": "Link 1"},
+            {"id": 2, "name": "Link 2"},
+            {"id": 3, "name": "Link 3"},
         ],
         "pagination": {
             "next_page": "http://address.com/item?page=3",
-            "last_page": "http://address.com/item?page=1"
+            "last_page": "http://address.com/item?page=1",
         },
-        "end_page": "http://address.com/item?page=100"
+        "end_page": "http://address.com/item?page=100",
     }
diff --git a/test/shared/schema.py b/test/shared/schema.py
index a17652e..3518b64 100644
--- a/test/shared/schema.py
+++ b/test/shared/schema.py
@@ -11,7 +11,9 @@ class ListItem(BaseModel):
     name: str | None = XPATHField(query="//a", extract="text")
     source: str = "https://mywebsite.com"
     source_as_default: None | str = XPATHField(
-        query="//a[@nothing]", extract="text", default="Nothing"
+        query="//a[@nothing]",
+        extract="text",
+        default="Nothing",
     )
 
 
@@ -72,15 +74,15 @@ class Config:
 
 
 class LinksData(BaseModel):
-    link: list = RegexField(regex=r"href=['\"]([^'\"]+)['\"]", many=True)
+    link: list = RegexField(query=r"href=['\"]([^'\"]+)['\"]", many=True)
 
 
 class LinksDataSingle(BaseModel):
-    link: str = RegexField(regex=r"href=['\"]([^'\"]+)['\"]")
+    link: str = RegexField(query=r"href=['\"]([^'\"]+)['\"]")
 
 
 class EmailData(BaseModel):
-    emails: list | None = RegexField(regex=r"[\w.-]+@[\w.-]+\.\w+", default=None)
+    emails: list | None = RegexField(query=r"[\w.-]+@[\w.-]+\.\w+", default=None)
 
 
 # class MDT_Item(BaseModel):
diff --git a/test/test_engine.py b/test/test_engine.py
index 6913665..456129b 100644
--- a/test/test_engine.py
+++ b/test/test_engine.py
@@ -1,33 +1,57 @@
+# pylint: skip-file
 from test.conftest import get_proxy_setting
 from time import perf_counter
 
 import pytest
 
-from fastcrawler.engine.aio import AioHTTP, Morsel
+from fastcrawler.engine.aio import AioHttpEngine, Morsel
+
+
+@pytest.mark.asyncio
+async def test_not_setuped_aiohttp():
+    engine = AioHttpEngine(cookies=None)
+    res = await engine.get(["http://127.0.0.1:8000/get"])
+    assert engine.session is None
+    await engine.teardown()
+    assert res == [None]
+
+
+@pytest.mark.asyncio
+async def test_aiohttp_cookies_and_proxy_attr(cookies):
+    proxy = get_proxy_setting()
+    # None cookies
+    async with AioHttpEngine(cookies=None) as engine:
+        assert engine.cookies is None
+    # with cookies
+    async with AioHttpEngine(cookies=cookies, proxy=proxy) as engine:
+        assert engine.cookies == cookies
+        assert engine.proxy == proxy
 
 
 @pytest.mark.asyncio
 async def test_aiohttp_with_statement(user_agent):
     urls = ["http://127.0.0.1:8000/throtlled/3/"] * 10
     useragent = user_agent
-    async with AioHTTP(useragent=useragent, connection_limit=5) as engine:
+    async with AioHttpEngine(useragent=useragent, connection_limit=5) as engine:
         responses = await engine.get(urls)
     for response in responses:
-        assert isinstance(response, str)
+        assert isinstance(response.text, str)
     assert len(responses) == len(urls)
 
 
 @pytest.mark.asyncio
 async def test_aiohttp_proxy(user_agent):
     urls = ["https://api.ipify.org?format=json"]
+    response = None
     useragent = user_agent
     proxy = get_proxy_setting()
-    engine = AioHTTP(useragent=useragent, proxy=proxy)
+    engine = AioHttpEngine(useragent=useragent, proxy=proxy)
     async with engine:
         responses = await engine.get(urls, verify_ssl=False)
     for response in responses:
-        assert isinstance(response, str)
-        assert proxy.server in response
+        assert isinstance(response.text, str)
+    assert response is not None
+    assert proxy.server in response.text
@@ -37,10 +61,10 @@ async def test_aiohttp_get_request(user_agent, cookies):
         "http://127.0.0.1:8000/headers",
         "http://127.0.0.1:8000/cookies",
     ]
-    async with AioHTTP(useragent=user_agent, cookies=cookies) as engine:
+    async with AioHttpEngine(useragent=user_agent, cookies=cookies) as engine:
         responses = await engine.get(urls)
     for response in responses:
-        assert isinstance(response, str)
+        assert isinstance(response.text, str)
@@ -48,43 +72,43 @@ async def test_aiohttp_get_wo_useragent_and_cookies_request():
     urls = [
         "http://127.0.0.1:8000/get",
     ]
-    async with AioHTTP() as engine:
+    async with AioHttpEngine() as engine:
         responses = await engine.get(urls)
     for response in responses:
-        assert isinstance(response, str)
+        assert isinstance(response.text, str)
 
 
 @pytest.mark.asyncio
-async def test_aiohttp_post_request(aiohttp_engine: AioHTTP):
+async def test_aiohttp_post_request(aiohttp_engine: AioHttpEngine):
     urls = ["http://127.0.0.1:8000/post"]
     datas = [{"key1": "value1", "key2": "value2"}, {"key3": "value3", "key4": "value4"}]
     responses = await aiohttp_engine.post(urls, datas)
     for response in responses:
-        assert isinstance(response, str)
+        assert isinstance(response.text, str)
 
 
 @pytest.mark.asyncio
-async def test_aiohttp_put_request(aiohttp_engine: AioHTTP):
+async def test_aiohttp_put_request(aiohttp_engine: AioHttpEngine):
     urls = ["http://127.0.0.1:8000/put"]
     datas = [{"key1": "value1", "key2": "value2"}, {"key3": "value3", "key4": "value4"}]
     responses = await aiohttp_engine.put(urls, datas)
     for response in responses:
-        assert isinstance(response, str)
+        assert isinstance(response.text, str)
 
 
 @pytest.mark.asyncio
-async def test_aiohttp_delete_request(aiohttp_engine: AioHTTP):
+async def test_aiohttp_delete_request(aiohttp_engine: AioHttpEngine):
     urls = ["http://127.0.0.1:8000/delete"]
     datas = [{"key1": "value1", "key2": "value2"}, {"key3": "value3", "key4": "value4"}]
     responses = await aiohttp_engine.delete(urls, datas)
     for response in responses:
-        assert isinstance(response, str)
+        assert isinstance(response.text, str)
 
 
 @pytest.mark.asyncio
 async def test_aiohttp_headers(headers, user_agent):
     expected_headers = {**headers, "User-Agent": user_agent}
-    async with AioHTTP(headers=headers, useragent=user_agent) as aiohttp_engine:
+    async with AioHttpEngine(headers=headers, useragent=user_agent) as aiohttp_engine:
         urls = [
             "http://127.0.0.1:8000/headers/",
         ]
@@ -103,13 +127,13 @@ def get_morsel(cookie):
 async def test_aiohttp_cookie(cookies, user_agent):
     cookies_origin = {cookie.name: get_morsel(cookie) for cookie in cookies}
 
-    async with AioHTTP(cookies=cookies, useragent=user_agent) as aiohttp_engine:
+    async with AioHttpEngine(cookies=cookies, useragent=user_agent) as aiohttp_engine:
         urls = [
             "http://127.0.0.1:8000/cookies/",
         ]
         await aiohttp_engine.get(urls)
         cookies = aiohttp_engine.session.cookie_jar.filter_cookies(
-            str(aiohttp_engine.session._base_url)
+            str(aiohttp_engine.session._base_url),
         )
 
     assert cookies_origin == cookies
@@ -118,13 +142,15 @@ async def test_aiohttp_cookie(cookies, user_agent):
 
 async def test_limit_per_host(headers, user_agent):
     """only test limit per host for AioHTTP engine (V Test)"""
-    async with AioHTTP(
-        headers=headers, useragent=user_agent, connection_limit=5
+    async with AioHttpEngine(
+        headers=headers,
+        useragent=user_agent,
+        connection_limit=3,
     ) as aiohttp_engine:
-        urls_1 = ["http://127.0.0.1:8000/throtlled/5/"] * 4
-        urls_2 = ["http://127.0.0.1:8000/throtlled/10/"] * 4
+        urls_1 = ["http://127.0.0.1:8000/throtlled/3/"] * 2
+        urls_2 = ["http://127.0.0.1:8000/throtlled/5/"]
         start = perf_counter()
         await aiohttp_engine.get(urls_1 + urls_2 + urls_1)
         end = perf_counter()
-        assert end - start == pytest.approx(20, abs=1)
+        assert end - start == pytest.approx(6, abs=1)
 
diff --git a/test/test_parser.py b/test/test_parser.py
index 708779d..aa019c2 100644
--- a/test/test_parser.py
+++ b/test/test_parser.py
@@ -1,6 +1,4 @@
 # pylint: skip-file
-import pytest
-
 from test.shared.schema import (
     EmailData,
     InnerHTML,
@@ -12,6 +10,9 @@
     VeryNestedCSS,
     VeryNestedJson,
 )
+
+import pytest
+
 from fastcrawler.exceptions import (
     ParserInvalidModelType,
     ParserValidationError,
@@ -101,6 +102,7 @@ def test_base_selector():
     obj = BaseSelector("Test", many=True)
     with pytest.raises(NotImplementedError):
         obj.resolve(None, None)
+    print(obj.__repr__())
     assert obj.__repr__() == "Field(type=BaseSelector extract=None, many=True, query=Test)"
diff --git a/test/test_registery.py b/test/test_registery.py
index 2fad8f0..02c7110 100644
--- a/test/test_registery.py
+++ b/test/test_registery.py
@@ -7,13 +7,13 @@
 
 
 def test_crawler_instances():
-    obj_a = Crawler('arg1')
-    obj_b = Crawler('arg2', keyword_arg='key_arg1')
+    obj_a = Crawler("arg1")
+    obj_b = Crawler("arg2", keyword_arg="key_arg1")
     all_objs = Crawler.get_all_objects()
     assert obj_a in all_objs
     assert obj_b in all_objs
-    assert all_objs[obj_a] == (('arg1',), {})
-    assert all_objs[obj_b] == (('arg2',), {'keyword_arg': 'key_arg1'})
+    assert all_objs[obj_a] == (("arg1",), {})
+    assert all_objs[obj_b] == (("arg2",), {"keyword_arg": "key_arg1"})
 
 
 def test_crawler_with_task():
@@ -26,11 +26,18 @@ class cls_B(Spider):
     class cls_C(Spider):
         pass
 
-    obj = Crawler(cls_A >> cls_B >> cls_C)
-    assert [cls_A, cls_B, cls_C] == obj.task.instances
+    obj1 = cls_A()
+    obj2 = cls_B()
+    obj3 = cls_C()
+    obj = Crawler(obj1 >> obj2 >> obj3)
+    assert [obj1, obj2, obj3] == obj.task.instances
 
     client_one = FastCrawler(crawlers=obj)
-    client_two = FastCrawler(crawlers=[obj, ])
+    client_two = FastCrawler(
+        crawlers=[
+            obj,
+        ]
+    )
     assert client_one.crawlers == client_two.crawlers
     with pytest.raises(NoCrawlerFoundError):
         FastCrawler(crawlers=None)
diff --git a/test/test_schedule.py b/test/test_schedule.py
index 1a6bcf4..ea27dab 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from fastcrawler.schedule.adopter import RocketryApplication, RocketryManager, TaskNotFound
+from fastcrawler.schedule.adopter import RocketryApplication, RocketryController, TaskNotFound
 from fastcrawler.schedule.schema import Task
 
 
@@ -55,24 +55,22 @@ async def test_get_all_task_to_rocketry_application(task_app: RocketryApplicatio
 async def test_shutdown_rocketry_application(task_app: RocketryApplication):
     new_task_1 = get_task(1)
     await task_app.add_task(task_function, new_task_1)
-    await asyncio.sleep(1)
     await task_app.shut_down()
     await asyncio.sleep(1)
     assert not task_app.task_lib.session.scheduler.is_alive
 
 
-# @pytest.mark.asyncio
-# async def test_serve_rocketry_application(task_app: RocketryApplication):
-#     new_task_1 = get_task(1)
-#     await task_app.add_task(task_function, new_task_1)
-#     await asyncio.sleep(1)
-#     await task_app.serve()
-#     await asyncio.sleep(1)
-#     assert task_app.task_lib.session.scheduler.is_alive
+@pytest.mark.asyncio
+async def test_serve_rocketry_application(task_app: RocketryApplication):
+    new_task_1 = get_task(1)
+    await task_app.add_task(task_function, new_task_1)
+    asyncio.create_task(task_app.serve())
+    await asyncio.sleep(1)
+    assert task_app.task_lib.session.scheduler.is_alive
 
 
 @pytest.mark.asyncio
-async def test_add_task_to_manager(task_manager: RocketryManager):
+async def test_add_task_to_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     task_names = {new_task_1.name}
     await task_manager.add_task(task_function, new_task_1)
@@ -82,7 +80,7 @@
 
 
 @pytest.mark.asyncio
-async def test_all_tasks_from_manager(task_manager: RocketryManager):
+async def test_all_tasks_from_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     new_task_2 = get_task(2)
     task_names = {task.name for task in (new_task_1, new_task_2)}
@@ -94,7 +92,7 @@
 
 
 @pytest.mark.asyncio
-async def test_change_task_schedule_from_manager(task_manager: RocketryManager):
+async def test_change_task_schedule_from_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     await task_manager.add_task(task_function, new_task_1)
     # if any problem is encountered during change_task_schedule it should raise an exception
@@ -103,7 +101,7 @@
 
 
 @pytest.mark.asyncio
-async def test_change_task_schedule_string_from_manager(task_manager: RocketryManager):
+async def test_change_task_schedule_string_from_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     await task_manager.add_task(task_function, new_task_1)
     # if any problem is encountered during change_task_schedule it should raise an exception
@@ -112,7 +110,7 @@
 
 
 @pytest.mark.asyncio
-async def test_fail_test_change_task_schedule_from_manager(task_manager: RocketryManager):
+async def test_fail_test_change_task_schedule_from_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     with pytest.raises(TaskNotFound):
         await task_manager.add_task(task_function, new_task_1)
@@ -120,7 +118,7 @@
 
 
 @pytest.mark.asyncio
-async def test_toggle_task_not_disabled_from_manager(task_manager: RocketryManager):
+async def test_toggle_task_not_disabled_from_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     await task_manager.add_task(task_function, new_task_1)
     # if any problem is encountered during toggle_task it should raise an exception
@@ -129,7 +127,7 @@ async def test_toggle_task_not_disabled_from_manag
 
 
 @pytest.mark.asyncio
-async def test_toggle_task_disabled_from_manager(task_manager: RocketryManager):
+async def test_toggle_task_disabled_from_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     new_task_1.disabled = True
     await task_manager.add_task(task_function, new_task_1)
@@ -139,7 +137,7 @@ async def test_toggle_task_disabled_from_manager(task_manager: RocketryManager):
 
 
 @pytest.mark.asyncio
-async def test_toggle_task_not_found_from_manager(task_manager: RocketryManager):
+async def test_toggle_task_not_found_from_manager(task_manager: RocketryController):
     new_task_1 = get_task(1)
     await task_manager.add_task(task_function, new_task_1)
     # if any problem is encountered during toggle_task it should raise an exception
diff --git a/tox.ini b/tox.ini
index ecc40c2..9d09045 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py{37,38,39,10,11}
+envlist = py{311}
 
 [testenv]
 deps =