From 8e26e4c15621dd22f05ee33c80808f999da22b31 Mon Sep 17 00:00:00 2001
From: PuQing
Date: Sun, 28 Jan 2024 00:15:05 +0800
Subject: [PATCH] Add Arxiv, ICLR, and ICML sources, and feedparser dependency

---
 backend/app/app/models.py          |  4 ++
 backend/app/app/source/Arxiv.py    | 67 ++++++++++++++++++++++++++++++
 backend/app/app/source/ICLR.py     |  6 +++
 backend/app/app/source/ICML.py     |  6 +++
 backend/app/app/source/NIPS.py     | 13 ++----
 backend/app/app/source/__init__.py |  6 ++-
 backend/app/app/source/base.py     | 44 ++++++++++++++++
 backend/app/app/worker.py          |  8 ++--
 backend/app/poetry.lock            | 26 +++++++++-
 backend/app/pyproject.toml         |  1 +
 10 files changed, 164 insertions(+), 17 deletions(-)
 create mode 100644 backend/app/app/source/Arxiv.py
 create mode 100644 backend/app/app/source/ICLR.py
 create mode 100644 backend/app/app/source/ICML.py

diff --git a/backend/app/app/models.py b/backend/app/app/models.py
index ee29778..bb5bc3c 100644
--- a/backend/app/app/models.py
+++ b/backend/app/app/models.py
@@ -88,6 +88,10 @@ class Item(ItemBase, table=True):
         sa_column=Column(JSON),
     )
     from_source: str = Field(nullable=False)
+    category: Union[list[str], None] = Field(
+        default=None,
+        sa_column=Column(JSON),
+    )
 
 
 # Properties to return via API, id is always required
diff --git a/backend/app/app/source/Arxiv.py b/backend/app/app/source/Arxiv.py
new file mode 100644
index 0000000..23c5c22
--- /dev/null
+++ b/backend/app/app/source/Arxiv.py
@@ -0,0 +1,67 @@
+import logging
+import re
+from typing import Any
+
+from scrapy.http import HtmlResponse
+
+from app.source.base import PaperRequestsTask, RSSTask
+
+
+class Arxiv(RSSTask):
+    url: str = "http://export.arxiv.org/rss/cs"
+    name: str = "Arxiv"
+    _cache_category_map: dict[str, str] = {}
+
+    @staticmethod
+    def parse(entry: dict) -> dict[str, Any]:
+        return {
+            "title": entry["title"],
+            "authors": entry["author"],
+            "url": entry["link"],
+            "abstract": entry["summary"],
+        }
+
+    @property
+    def category_map(self) -> dict[str, str]:
+        if not self._cache_category_map:
+            response = PaperRequestsTask._request(
+                "https://arxiv.org/category_taxonomy",
+            )
+            if response is None:
+                return {}
+            response = HtmlResponse(
+                url="",
+                body=response.text,
+                encoding="utf-8",
+            )
+            category = response.css("h4::text").getall()
+            full_name = response.css("span::text").getall()
+            for i, c in enumerate(category):
+                self._cache_category_map[c] = (
+                    full_name[i].replace("(", "").replace(")", "")
+                )
+
+        return self._cache_category_map
+
+    def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]:
+        # Extract the category tag (e.g. "cs.LG") before truncating the
+        # title: the tag lives in the suffix that split() discards.
+        match = re.search(r"\[(.*?)\]", entry["title"])
+        category = match.group(1) if match else None
+        entry["title"] = entry["title"].split("(", 1)[0]
+        entry["authors"] = (
+            HtmlResponse(url="", body=entry["authors"], encoding="utf-8")
+            .css("a::text")
+            .getall()
+        )
+        entry["abstract"] = (
+            HtmlResponse(url="", body=entry["abstract"], encoding="utf-8")
+            .css("p::text")
+            .get()
+        )
+        if category in self.category_map:
+            entry["category"] = self.category_map[category]
+        else:
+            logging.warning(f"Unknown category: {category}")
+            entry["category"] = None
+        return entry
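
Note: the arXiv RSS feed packs the category tag and arXiv id into the
entry title, e.g. "A Study of Widgets. (arXiv:2401.01234v1 [cs.LG])",
which is why post_parse has to pull the "[...]" tag out of the title
before truncating it at the first "(". A minimal standalone sketch of
that order of operations (the sample title below is fabricated for
illustration):

    import re

    # Fabricated title shaped like an old-style arXiv RSS item.
    title = "A Study of Widgets. (arXiv:2401.01234v1 [cs.LG])"

    match = re.search(r"\[(.*?)\]", title)
    category = match.group(1) if match else None  # "cs.LG"
    short_title = title.split("(", 1)[0].strip()  # "A Study of Widgets."

Doing the extraction after the split would always fail, since the
truncation discards exactly the suffix that carries the "[cs.LG]" tag.
Note also that models.py declares Item.category as Union[list[str],
None] while post_parse stores a single string; one of the two
presumably wants aligning.
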
diff --git a/backend/app/app/source/ICLR.py b/backend/app/app/source/ICLR.py
new file mode 100644
index 0000000..e70dfe7
--- /dev/null
+++ b/backend/app/app/source/ICLR.py
@@ -0,0 +1,6 @@
+from app.source.NIPS import NIPS
+
+
+class ICLR(NIPS):
+    url: str = "https://iclr.cc/Conferences/2023/Schedule?type=Poster"
+    name: str = "ICLR"
diff --git a/backend/app/app/source/ICML.py b/backend/app/app/source/ICML.py
new file mode 100644
index 0000000..967a005
--- /dev/null
+++ b/backend/app/app/source/ICML.py
@@ -0,0 +1,6 @@
+from app.source.NIPS import NIPS
+
+
+class ICML(NIPS):
+    url: str = "https://icml.cc/Conferences/2023/Schedule?type=Poster"
+    name: str = "ICML"
diff --git a/backend/app/app/source/NIPS.py b/backend/app/app/source/NIPS.py
index c723800..d6f4f7b 100644
--- a/backend/app/app/source/NIPS.py
+++ b/backend/app/app/source/NIPS.py
@@ -2,12 +2,12 @@
 
 from scrapy.http import HtmlResponse
 
-from app.source.base import PaperRequestsTask
+from app.source.base import PaperRequestsTask, openreview_url
 
 
-class Nips(PaperRequestsTask):
+class NIPS(PaperRequestsTask):
     url: str = "https://nips.cc/Conferences/2023/Schedule?type=Poster"
-    name: str = "Nips"
+    name: str = "NIPS"
 
     @staticmethod
     def parse_urls(response: HtmlResponse) -> list[str]:
@@ -41,10 +41,3 @@ def post_parse(item: dict[str, Any]) -> dict[str, Any]:
         for i, author in enumerate(item["authors"]):
             item["authors"][i] = author.strip()
         return item
-
-
-def openreview_url(urls):
-    for url in urls[::-1]:
-        if "openreview" in url:
-            return url
-    return urls[0]  # if no openreview url, return the first url
diff --git a/backend/app/app/source/__init__.py b/backend/app/app/source/__init__.py
index 80b3ed0..ae27c98 100644
--- a/backend/app/app/source/__init__.py
+++ b/backend/app/app/source/__init__.py
@@ -1,4 +1,6 @@
 from app.source.base import PaperRequestsTask
-from app.source.NIPS import Nips
+from app.source.ICLR import ICLR
+from app.source.ICML import ICML
+from app.source.NIPS import NIPS
 
-__all__ = ["PaperRequestsTask", "Nips"]
+__all__ = ["PaperRequestsTask", "NIPS", "ICLR", "ICML"]
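
Note: with PaperRequestsTask doing all of the scraping work, a venue
that reuses the same Schedule-page layout now needs only a two-line
subclass, exactly as ICLR and ICML do above. A sketch for one more
hypothetical venue (the name and URL below are placeholders, not part
of this patch):

    from app.source.NIPS import NIPS


    class CoolConf(NIPS):
        # Hypothetical venue for illustration; the URL shape is assumed
        # to match the nips.cc/iclr.cc/icml.cc schedule pages.
        url: str = "https://coolconf.example/Conferences/2024/Schedule?type=Poster"
        name: str = "CoolConf"

Such a subclass would still need to be exported from
app/source/__init__.py and registered with the Celery app in worker.py
before it can be dispatched.
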
+ """ + from app.db.engine import engine + + return Session(engine) + + @staticmethod + def parse(entry) -> dict[str, Any]: + raise NotImplementedError + + def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]: + return entry + + def run(self): + feed: FeedParserDict = feedparser.parse(self.url) + results = [] + for entry in feed.entries: + item = self.parse(entry) + item = self.post_parse(item) + + if item["title"] is None or item["abstract"] is None: + logging.warning(f"Empty title or abstract: {entry.link}") + continue + + results.append((entry.link, item)) diff --git a/backend/app/app/worker.py b/backend/app/app/worker.py index ce31ac5..dac193b 100644 --- a/backend/app/app/worker.py +++ b/backend/app/app/worker.py @@ -7,9 +7,9 @@ from app.core.celery_app import celery_app from app.core.config import settings from app.models import CrawledItem -from app.source import Nips +from app.source import NIPS -celery_app.register_task(Nips()) +celery_app.register_task(NIPS()) def batch(iterable: Union[set[str], list[str]], n: int = 1): @@ -43,7 +43,7 @@ def db(self) -> Session: ignore_result=True, ) def test_celery_worker(self: DatabaseTask, word: str) -> None: - urls = set(Nips.get_urls()) + urls = set(NIPS.get_urls()) # remove duplicates from db with self.db as db: @@ -57,4 +57,4 @@ def test_celery_worker(self: DatabaseTask, word: str) -> None: logging.info(f"Cache hit rate: {cache_hit_rate * 100:.2f}%") for url in batch(urls, settings.REQUESTS_BATCH_SIZE): - celery_app.send_task("Nips", kwargs={"urls": url}) + celery_app.send_task("NIPS", kwargs={"urls": url}) diff --git a/backend/app/poetry.lock b/backend/app/poetry.lock index f028330..4106188 100644 --- a/backend/app/poetry.lock +++ b/backend/app/poetry.lock @@ -836,6 +836,20 @@ typing-extensions = ">=4.5.0" [package.extras] all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +[[package]] +name = "feedparser" +version = "6.0.11" +description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds" +optional = false +python-versions = ">=3.6" +files = [ + {file = "feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45"}, + {file = "feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5"}, +] + +[package.dependencies] +sgmllib3k = "*" + [[package]] name = "filelock" version = "3.13.1" @@ -2753,6 +2767,16 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "sgmllib3k" +version = "1.0.0" 
+description = "Py3k port of sgmllib." +optional = false +python-versions = "*" +files = [ + {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"}, +] + [[package]] name = "six" version = "1.16.0" @@ -3660,4 +3684,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f0642aa00fc65a89ca38becf3cd795d97cba3a89fdaee47d696f38004d779644" +content-hash = "f5f91fbf95ff3897b2922797fd22c9c7c8853b8ca9f9e22baee985a61d70495f" diff --git a/backend/app/pyproject.toml b/backend/app/pyproject.toml index a0fb249..b12db50 100644 --- a/backend/app/pyproject.toml +++ b/backend/app/pyproject.toml @@ -29,6 +29,7 @@ celery = "^5.3.6" toml = "^0.10.2" psycopg = { extras = ["binary"], version = "^3.1.13" } scrapy = "^2.11.0" +feedparser = "^6.0.11" [tool.poetry.dev-dependencies]