♻️ Redesigned Boilerplate (#39)
* ♻️ refactor design

* 🗑️ Deprecated selectolax

* ♻️ improved contracts and base design

* ✅ add rocketry serve test

* 🦺 Update AioHttpEngine `cookies` and `proxy` type

* ✅ Add engine test for attrs

* 🎨 Update pre-commit and `pyproject.toml` and `setup.cfg`

* 🎨 Apply format with conf on all project

* 🔧 update config files

* 🚨 fix mypy type errors

* 🎨 improve core adaptor

* ✅ Update tests

* 👷 Update req-dev

* 👷 Update tox.ini

* 🎨 Improved and centralized protos

* ⚡️ Update `_get_cookie` and add a test for non-setuped engine

---------

Co-authored-by: Sadegh Yazdani
ManiMozaffar authored and aerosadegh committed Jul 16, 2023
1 parent 4d9459e commit 7009996
Showing 51 changed files with 624 additions and 343 deletions.
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
@@ -7,7 +7,7 @@ repos:
       - id: check-yaml
       - id: check-added-large-files
   - repo: 'https://github.com/psf/black'
-    rev: 23.3.0
+    rev: 23.7.0
     hooks:
       - id: black
   - repo: 'https://github.com/PyCQA/flake8'
@@ -18,3 +18,8 @@ repos:
     rev: v1.4.1
     hooks:
       - id: mypy
+        name: mypy (fastcrawler)
+        files: ^fastcrawler/
+      # - id: mypy
+      #   name: mypy (test)
+      #   files: ^test/
2 changes: 1 addition & 1 deletion README.md
@@ -1 +1 @@
-# fastcrawler
+# fastcrawler
2 changes: 1 addition & 1 deletion docs_src/initilizing_project/sample1/main.py
@@ -2,5 +2,5 @@
 from fastcrawler import FastCrawler

 app = FastCrawler(
-    crawlers=wiki_spider
+    crawlers=wiki_spider,
 )
11 changes: 7 additions & 4 deletions docs_src/initilizing_project/sample1/wikipedia.py
@@ -2,7 +2,7 @@


 from fastcrawler import BaseModel, Crawler, CSSField, Spider, XPATHField
-from fastcrawler.engine import AioHTTP
+from fastcrawler.engine import AioHttpEngine


 class PageResolver(BaseModel):
@@ -16,21 +16,24 @@ class ArticleData(BaseModel):


 class WikiBaseSpider(Spider):
-    engine = AioHTTP
+    engine = AioHttpEngine
     concurrency = 100


 class WikiArticleFinder(WikiBaseSpider):
     data_model = PageResolver
     req_count = 1_000_000
-    start_url = ["https://meta.wikimedia.org/wiki/List_of_Wikipedias", ]
+    start_url = [
+        "https://meta.wikimedia.org/wiki/List_of_Wikipedias",
+    ]


 class WikiArticleRetirever(WikiBaseSpider):
     data_model = ArticleData
     req_count = 1_000_000

-    async def save_data(self, data: ArticleData): ...  # save parsed data to database
+    async def save_data(self, data: ArticleData):
+        ...  # save parsed data to database


 wiki_spider = Crawler(WikiArticleFinder >> WikiArticleRetirever)
9 changes: 7 additions & 2 deletions fastcrawler/__init__.py
@@ -1,5 +1,7 @@
 from .core import Crawler, FastCrawler, Spider
-from .parsers import BaseModel, CSSField, XPATHField, RegexField
+from .engine import AioHttpEngine
+from .parsers import BaseModel, CSSField, RegexField, XPATHField
+from .schedule import RocketryApplication, RocketryController
 from .utils import Depends

 __all__ = [
@@ -10,5 +12,8 @@
     "Depends",
     "Spider",
     "Crawler",
-    "FastCrawler"
+    "FastCrawler",
+    "RocketryApplication",
+    "RocketryController",
+    "AioHttpEngine",
 ]
2 changes: 1 addition & 1 deletion fastcrawler/core/__init__.py
@@ -5,5 +5,5 @@
 __all__ = [
     "Crawler",
     "Spider",
-    "FastCrawler"
+    "FastCrawler",
 ]
1 change: 1 addition & 0 deletions fastcrawler/core/registery.py
@@ -9,6 +9,7 @@ class CrawlerMeta(type):
     DONT TOUCH THIS CLASS UNLESS YOU KNOW WHAT YOU ARE DOING.
     """
+
     def __init__(cls, name, bases, dct):
         super().__init__(name, bases, dct)
         cls._instances = {}
30 changes: 16 additions & 14 deletions fastcrawler/core/spider.py
@@ -1,11 +1,25 @@
 from typing import List


-class SpiderMetaClass(type):
+class Spider:
+    """
+    Spider class to create the actual spider interface
+    so that configuration of each spider can be given
+    as class properties from the inheritanced class from spider
+    instances property hold the instances that were set by metaclass
+    that is connected to current spider class
+    """
+
+    instances: List["Spider"]
+
+    def __init__(self):
+        ...
+
     def __rshift__(self, other: "Spider") -> "Spider":
         """
         leveraged RSHIFT method for magic in flow >>
         objA >> objB >> objC >> objD
         clsA >> clsB >> clsC >> clsD
         Must be used as metaclass to inject behaviour to subclass
@@ -17,15 +31,3 @@ def __rshift__(self, other: "Spider") -> "Spider":
         self.instances.append(other)
         setattr(other, "instances", self.instances)
         return other
-
-
-class Spider(metaclass=SpiderMetaClass):
-    """
-    Spider class to create the actual spider interface
-    so that configuration of each spider can be given
-    as class properties from the inheritanced class from spider
-    instances property hold the instances that were set by metaclass
-    that is connected to current spider class
-    """
-    instances: List["Spider"]
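Aside: this refactor moves `__rshift__` off the old metaclass and onto `Spider` itself. Below is a minimal sketch of the flow syntax it powers, with hypothetical subclasses, assuming the elided part of the method initializes `instances` on first use:

from fastcrawler import Spider


class FetchIndex(Spider):  # hypothetical first stage
    ...


class FetchArticle(Spider):  # hypothetical second stage
    ...


# __rshift__ appends the right-hand spider to the shared `instances` list
# and returns it, so a chain accumulates left to right:
flow = FetchIndex() >> FetchArticle()
print(flow.instances)  # conceptually: [<FetchIndex>, <FetchArticle>]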
6 changes: 3 additions & 3 deletions fastcrawler/engine/__init__.py
@@ -1,8 +1,8 @@
-from .aio import AioHTTP
-from .base import ProxySetting, SetCookieParam
+from .aio import AioHttpEngine
+from .contracts import ProxySetting, SetCookieParam

 __all__ = [
     "ProxySetting",
     "SetCookieParam",
-    "AioHTTP",
+    "AioHttpEngine",
 ]
96 changes: 68 additions & 28 deletions fastcrawler/engine/aio.py
@@ -1,13 +1,15 @@
 import asyncio
+from typing import Any

 import pydantic
 from aiohttp import BasicAuth, ClientSession, TCPConnector
+from aiohttp.client import ClientResponse
 from aiohttp.cookiejar import Morsel

-from fastcrawler.engine.base import ProxySetting, SetCookieParam
+from fastcrawler.engine.contracts import ProxySetting, Response, SetCookieParam


-class AioHTTP:
+class AioHttpEngine:
     def __init__(
         self,
         cookies: list[SetCookieParam] | None = None,
@@ -17,9 +19,9 @@ def __init__(
         connection_limit: int = 100,
     ):
         """Initialize a new engine instance with given cookie, header, useragent, and proxy"""
-        self.session = None
+        self.session: None | ClientSession = None
         self._cookies = (
-            [(cookie.name, self.get_morsel_cookie(cookie)) for cookie in cookies]
+            [(cookie.name, self._get_morsel_cookie(cookie)) for cookie in cookies]
             if cookies is not None
             else None
         )
@@ -30,29 +32,39 @@ def __init__(

         self._connector = TCPConnector(limit_per_host=connection_limit)

-        self._proxy = {}
+        self._proxy: dict[Any, Any] = {}
+        self.proxy_dct = proxy
         if proxy:
             proxy_url = f"{proxy.protocol}{proxy.server}:{proxy.port}"
             self._proxy["proxy"] = proxy_url
             if proxy.username and proxy.password:
-                auth = BasicAuth(login=proxy.username, password=proxy.password)
-                self._proxy["proxy_auth"] = auth
+                self._proxy["proxy_auth"] = BasicAuth(
+                    login=proxy.username, password=proxy.password
+                )

     @property
-    def cookies(self):
-        return self._cookies
+    def cookies(self) -> list[SetCookieParam] | None:
+        """Return cookies"""
+        cookies = None
+        if self._cookies is not None:
+            cookies = [self._get_cookie(cookie) for _, cookie in self._cookies]
+
+        return cookies

     @property
-    def headers(self):
+    def headers(self) -> dict:
+        """Return headers"""
         return self._headers

     @property
-    def proxy(self):
-        return self._proxy
+    def proxy(self) -> ProxySetting | None:
+        """Return proxy setting"""
+        return self.proxy_dct

-    def get_morsel_cookie(self, cookie: SetCookieParam) -> Morsel:
+    @staticmethod
+    def _get_morsel_cookie(cookie: SetCookieParam) -> Morsel:
         """Converts a SetCookieParam object to an Morsel object."""
-        morsel_obj = Morsel()
+        morsel_obj: Morsel = Morsel()
         morsel_obj.set(cookie.name, cookie.value, cookie.value)
         morsel_obj.update(
             dict(
@@ -66,6 +78,21 @@ def get_morsel_cookie(self, cookie: SetCookieParam) -> Morsel:
             )
         )
         return morsel_obj

+    @staticmethod
+    def _get_cookie(cookie: Morsel) -> SetCookieParam:
+        """convert Morsel object to SetCookieParam object"""
+        cookie_params = {
+            "name": cookie.key,
+            "value": cookie.value,
+            "domain": cookie.get("domain"),
+            "path": cookie.get("path"),
+            "expires": cookie.get("expires"),
+            "httpOnly": cookie.get("httponly"),
+            "secure": cookie.get("secure"),
+            "sameSite": cookie.get("samesite"),
+        }
+        return SetCookieParam(**cookie_params)
+
     async def __aenter__(self):
         """Async context manager support for engine -> ENTER"""
         await self.setup()
@@ -79,46 +106,59 @@ async def setup(self, **kwargs) -> None:
         """Set-up up the engine for crawling purpose."""
         self.session = ClientSession(
             connector=self._connector,
-            cookies=self.cookies,
+            cookies=self._cookies,
             headers=self.headers,
             trust_env=True,
             **kwargs,
         )

     async def teardown(self) -> None:
         """Cleans up the engine."""
-        await self.session.close()
+        if self.session:
+            await self.session.close()

-    async def base(self, url: pydantic.AnyUrl, method: str, data: dict, **kwargs) -> str:
+    async def base(
+        self, url: pydantic.AnyUrl, method: str, data: dict | None, **kwargs
+    ) -> Response | None:
         """Base Method for protocol to retrieve a list of URL."""

-        async with self.session.request(
-            method, url, data=data, headers=self.headers, **self.proxy, **kwargs
-        ) as response:
-            return await response.text()
-
-    async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[str] | str:
+        if self.session:
+            async with self.session.request(
+                method, str(url), data=data, headers=self.headers, **self._proxy, **kwargs
+            ) as response:
+                return await self.translate_to_response(response)
+        return None
+
+    async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[Response]:
         """GET HTTP Method for protocol to retrieve a list of URL."""
         tasks = [self.base(url, "GET", None, **kwargs) for url in urls]
         return await asyncio.gather(*tasks)

     async def post(
         self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
-    ) -> list[str] | str:
+    ) -> list[Response]:
         """POST HTTP Method for protocol to crawl a list of URL."""
         tasks = [self.base(url, "POST", data=data, **kwargs) for url, data in zip(urls, datas)]
         return await asyncio.gather(*tasks)

     async def put(
         self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
-    ) -> list[str] | str:
+    ) -> list[Response]:
         """PUT HTTP Method for protocol to crawl a list of URL."""
-        tasks = [self.base(url, "PUT", data=data) for url, data in zip(urls, datas)]
+        tasks = [self.base(url, "PUT", data=data, **kwargs) for url, data in zip(urls, datas)]
         return await asyncio.gather(*tasks)

     async def delete(
         self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
-    ) -> list[str] | str:
+    ) -> list[Response]:
         """DELETE HTTP Method for protocol to crawl a list of URL."""
         tasks = [self.base(url, "DELETE", data=data, **kwargs) for url, data in zip(urls, datas)]
         return await asyncio.gather(*tasks)
+
+    async def translate_to_response(self, response_obj: ClientResponse) -> Response:
+        """Translate aiohttp response object to Response object"""
+        return Response(
+            text=await response_obj.text(),
+            status_code=response_obj.status,
+            headers=response_obj.headers,
+            cookie=response_obj.cookies,
+        )
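For orientation, a short usage sketch of the renamed engine against the surface shown in this diff, assuming `__aenter__` returns the engine instance (the URL is a placeholder):

import asyncio

from fastcrawler.engine import AioHttpEngine


async def main() -> None:
    # The async context manager wraps setup()/teardown(), creating and
    # closing the underlying aiohttp ClientSession automatically.
    async with AioHttpEngine(connection_limit=10) as engine:
        # get() fans out one request per URL and gathers Response objects.
        responses = await engine.get(["https://example.com"])  # placeholder URL
        first = responses[0]
        print(first.status_code, (first.text or "")[:80])


asyncio.run(main())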
Empty file removed fastcrawler/engine/constants.py
26 changes: 18 additions & 8 deletions fastcrawler/engine/base.py → fastcrawler/engine/contracts.py
@@ -7,15 +7,15 @@


 class SetCookieParam(pydantic.BaseModel):
-    name: str
-    value: str
+    name: str = ""
+    value: str = ""
     url: str | None = None
     domain: str = ""
-    path: str | None = None
-    expires: float | None = None
-    httpOnly: bool | None = None
-    secure: bool | None = None
-    sameSite: Literal["Lax", "None", "Strict"] | None = None
+    path: str = ""
+    expires: str = ""
+    httpOnly: str = ""
+    secure: str = ""
+    sameSite: str | Literal["Lax", "None", "Strict"] = ""


 class ProxySetting(pydantic.BaseModel):
@@ -26,6 +26,13 @@ class ProxySetting(pydantic.BaseModel):
     password: str | None = None


+class Response(pydantic.BaseModel):
+    text: str | None = None
+    status_code: int | None = None
+    headers: dict | None = None
+    cookie: dict | None = None
+
+
 class EngineProto(Protocol):
     def __init__(
         self,
@@ -34,7 +41,7 @@ def __init__(
         useragent: str | None,
         proxy: ProxySetting | None,
     ):
-        """Initialize a new engine instance with given cookie(s), header(s), useragent, and proxy"""
+        "Initialize a new engine instance with given cookie(s), header(s), useragent, and proxy"

     async def __aenter__(self):
         """Async context manager support for engine -> ENTER"""
@@ -62,3 +69,6 @@ async def put(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str:

     async def delete(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str:
         """DELETE HTTP Method for protocol to crawl a list of URL."""
+
+    async def translate_to_response(self, response_obj: type) -> Response:
+        """Translate the response object to a Response object"""