From 8e26e4c15621dd22f05ee33c80808f999da22b31 Mon Sep 17 00:00:00 2001
From: PuQing
Date: Sun, 28 Jan 2024 00:15:05 +0800
Subject: [PATCH] Add Arxiv, ICLR, and ICML sources, and feedparser dependency

---
 backend/app/app/models.py          |  4 ++
 backend/app/app/source/Arxiv.py    | 67 ++++++++++++++++++++++++++++++
 backend/app/app/source/ICLR.py     |  6 +++
 backend/app/app/source/ICML.py     |  6 +++
 backend/app/app/source/NIPS.py     | 13 ++----
 backend/app/app/source/__init__.py |  6 ++-
 backend/app/app/source/base.py     | 44 ++++++++++++++++
 backend/app/app/worker.py          |  8 ++--
 backend/app/poetry.lock            | 26 +++++++++-
 backend/app/pyproject.toml         |  1 +
 10 files changed, 164 insertions(+), 17 deletions(-)
 create mode 100644 backend/app/app/source/Arxiv.py
 create mode 100644 backend/app/app/source/ICLR.py
 create mode 100644 backend/app/app/source/ICML.py

diff --git a/backend/app/app/models.py b/backend/app/app/models.py
index ee29778..bb5bc3c 100644
--- a/backend/app/app/models.py
+++ b/backend/app/app/models.py
@@ -88,6 +88,10 @@ class Item(ItemBase, table=True):
         sa_column=Column(JSON),
     )
     from_source: str = Field(nullable=False)
+    category: Union[list[str], None] = Field(
+        default=None,
+        sa_column=Column(JSON),
+    )
 
 
 # Properties to return via API, id is always required
diff --git a/backend/app/app/source/Arxiv.py b/backend/app/app/source/Arxiv.py
new file mode 100644
index 0000000..23c5c22
--- /dev/null
+++ b/backend/app/app/source/Arxiv.py
@@ -0,0 +1,67 @@
+import logging
+import re
+from typing import Any
+
+from scrapy.http import HtmlResponse
+
+from app.source.base import PaperRequestsTask, RSSTask
+
+
+class Arxiv(RSSTask):
+    url: str = "http://export.arxiv.org/rss/cs"
+    name: str = "Arxiv"
+    _cache_category_map: dict[str, str] = {}
+
+    @staticmethod
+    def parse(entry: dict) -> dict[str, Any]:
+        return {
+            "title": entry["title"],
+            "authors": entry["author"],
+            "url": entry["link"],
+            "abstract": entry["summary"],
+        }
+
+    @property
+    def category_map(self) -> dict[str, str]:
+        if not self._cache_category_map:
+            response = PaperRequestsTask._request(
+                "https://arxiv.org/category_taxonomy",
+            )
+            if response is None:
+                return {}
+            response = HtmlResponse(
+                url="",
+                body=response.text,
+                encoding="utf-8",
+            )
+            category = response.css("h4::text").getall()
+            full_name = response.css("span::text").getall()
+            for i, c in enumerate(category):
+                self._cache_category_map[c] = (
+                    full_name[i].replace("(", "").replace(")", "")
+                )
+
+        return self._cache_category_map
+
+    def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]:
+        # Extract the category tag (e.g. "cs.LG") before truncating the
+        # title: the tag lives in the suffix that split() discards.
+        match = re.search(r"\[(.*?)\]", entry["title"])
+        category = match.group(1) if match else None
+        entry["title"] = entry["title"].split("(", 1)[0]
+        entry["authors"] = (
+            HtmlResponse(url="", body=entry["authors"], encoding="utf-8")
+            .css("a::text")
+            .getall()
+        )
+        entry["abstract"] = (
+            HtmlResponse(url="", body=entry["abstract"], encoding="utf-8")
+            .css("p::text")
+            .get()
+        )
+        if category in self.category_map:
+            entry["category"] = self.category_map[category]
+        else:
+            logging.warning(f"Unknown category: {category}")
+            entry["category"] = None
+        return entry
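
Note: the arXiv RSS feed packs the category tag and arXiv id into the
entry title, e.g. "A Study of Widgets. (arXiv:2401.01234v1 [cs.LG])",
which is why post_parse has to pull the "[...]" tag out of the title
before truncating it at the first "(". A minimal standalone sketch of
that order of operations (the sample title below is fabricated for
illustration):

    import re

    # Fabricated title shaped like an old-style arXiv RSS item.
    title = "A Study of Widgets. (arXiv:2401.01234v1 [cs.LG])"

    match = re.search(r"\[(.*?)\]", title)
    category = match.group(1) if match else None  # "cs.LG"
    short_title = title.split("(", 1)[0].strip()  # "A Study of Widgets."

Doing the extraction after the split would always fail, since the
truncation discards exactly the suffix that carries the "[cs.LG]" tag.
Note also that models.py declares Item.category as Union[list[str],
None] while post_parse stores a single string; one of the two
presumably wants aligning.
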
diff --git a/backend/app/app/source/ICLR.py b/backend/app/app/source/ICLR.py
new file mode 100644
index 0000000..e70dfe7
--- /dev/null
+++ b/backend/app/app/source/ICLR.py
@@ -0,0 +1,6 @@
+from app.source.NIPS import NIPS
+
+
+class ICLR(NIPS):
+    url: str = "https://iclr.cc/Conferences/2023/Schedule?type=Poster"
+    name: str = "ICLR"
diff --git a/backend/app/app/source/ICML.py b/backend/app/app/source/ICML.py
new file mode 100644
index 0000000..967a005
--- /dev/null
+++ b/backend/app/app/source/ICML.py
@@ -0,0 +1,6 @@
+from app.source.NIPS import NIPS
+
+
+class ICML(NIPS):
+    url: str = "https://icml.cc/Conferences/2023/Schedule?type=Poster"
+    name: str = "ICML"
diff --git a/backend/app/app/source/NIPS.py b/backend/app/app/source/NIPS.py
index c723800..d6f4f7b 100644
--- a/backend/app/app/source/NIPS.py
+++ b/backend/app/app/source/NIPS.py
@@ -2,12 +2,12 @@
 
 from scrapy.http import HtmlResponse
 
-from app.source.base import PaperRequestsTask
+from app.source.base import PaperRequestsTask, openreview_url
 
 
-class Nips(PaperRequestsTask):
+class NIPS(PaperRequestsTask):
     url: str = "https://nips.cc/Conferences/2023/Schedule?type=Poster"
-    name: str = "Nips"
+    name: str = "NIPS"
 
     @staticmethod
     def parse_urls(response: HtmlResponse) -> list[str]:
@@ -41,10 +41,3 @@ def post_parse(item: dict[str, Any]) -> dict[str, Any]:
         for i, author in enumerate(item["authors"]):
             item["authors"][i] = author.strip()
         return item
-
-
-def openreview_url(urls):
-    for url in urls[::-1]:
-        if "openreview" in url:
-            return url
-    return urls[0]  # if no openreview url, return the first url
diff --git a/backend/app/app/source/__init__.py b/backend/app/app/source/__init__.py
index 80b3ed0..ae27c98 100644
--- a/backend/app/app/source/__init__.py
+++ b/backend/app/app/source/__init__.py
@@ -1,4 +1,6 @@
 from app.source.base import PaperRequestsTask
-from app.source.NIPS import Nips
+from app.source.ICLR import ICLR
+from app.source.ICML import ICML
+from app.source.NIPS import NIPS
 
-__all__ = ["PaperRequestsTask", "Nips"]
+__all__ = ["PaperRequestsTask", "NIPS", "ICLR", "ICML"]
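
Note: with PaperRequestsTask doing all of the scraping work, a venue
that reuses the same Schedule-page layout now needs only a two-line
subclass, exactly as ICLR and ICML do above. A sketch for one more
hypothetical venue (the name and URL below are placeholders, not part
of this patch):

    from app.source.NIPS import NIPS


    class CoolConf(NIPS):
        # Hypothetical venue for illustration; the URL shape is assumed
        # to match the nips.cc/iclr.cc/icml.cc schedule pages.
        url: str = "https://coolconf.example/Conferences/2024/Schedule?type=Poster"
        name: str = "CoolConf"

Such a subclass would still need to be exported from
app/source/__init__.py and registered with the Celery app in worker.py
before it can be dispatched.
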
+ """ + from app.db.engine import engine + + return Session(engine) + + @staticmethod + def parse(entry) -> dict[str, Any]: + raise NotImplementedError + + def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]: + return entry + + def run(self): + feed: FeedParserDict = feedparser.parse(self.url) + results = [] + for entry in feed.entries: + item = self.parse(entry) + item = self.post_parse(item) + + if item["title"] is None or item["abstract"] is None: + logging.warning(f"Empty title or abstract: {entry.link}") + continue + + results.append((entry.link, item)) diff --git a/backend/app/app/worker.py b/backend/app/app/worker.py index ce31ac5..dac193b 100644 --- a/backend/app/app/worker.py +++ b/backend/app/app/worker.py @@ -7,9 +7,9 @@ from app.core.celery_app import celery_app from app.core.config import settings from app.models import CrawledItem -from app.source import Nips +from app.source import NIPS -celery_app.register_task(Nips()) +celery_app.register_task(NIPS()) def batch(iterable: Union[set[str], list[str]], n: int = 1): @@ -43,7 +43,7 @@ def db(self) -> Session: ignore_result=True, ) def test_celery_worker(self: DatabaseTask, word: str) -> None: - urls = set(Nips.get_urls()) + urls = set(NIPS.get_urls()) # remove duplicates from db with self.db as db: @@ -57,4 +57,4 @@ def test_celery_worker(self: DatabaseTask, word: str) -> None: logging.info(f"Cache hit rate: {cache_hit_rate * 100:.2f}%") for url in batch(urls, settings.REQUESTS_BATCH_SIZE): - celery_app.send_task("Nips", kwargs={"urls": url}) + celery_app.send_task("NIPS", kwargs={"urls": url}) diff --git a/backend/app/poetry.lock b/backend/app/poetry.lock index f028330..4106188 100644 --- a/backend/app/poetry.lock +++ b/backend/app/poetry.lock @@ -836,6 +836,20 @@ typing-extensions = ">=4.5.0" [package.extras] all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +[[package]] +name = "feedparser" +version = "6.0.11" +description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds" +optional = false +python-versions = ">=3.6" +files = [ + {file = "feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45"}, + {file = "feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5"}, +] + +[package.dependencies] +sgmllib3k = "*" + [[package]] name = "filelock" version = "3.13.1" @@ -2753,6 +2767,16 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "sgmllib3k" +version = "1.0.0" 
+description = "Py3k port of sgmllib." +optional = false +python-versions = "*" +files = [ + {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"}, +] + [[package]] name = "six" version = "1.16.0" @@ -3660,4 +3684,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f0642aa00fc65a89ca38becf3cd795d97cba3a89fdaee47d696f38004d779644" +content-hash = "f5f91fbf95ff3897b2922797fd22c9c7c8853b8ca9f9e22baee985a61d70495f" diff --git a/backend/app/pyproject.toml b/backend/app/pyproject.toml index a0fb249..b12db50 100644 --- a/backend/app/pyproject.toml +++ b/backend/app/pyproject.toml @@ -29,6 +29,7 @@ celery = "^5.3.6" toml = "^0.10.2" psycopg = { extras = ["binary"], version = "^3.1.13" } scrapy = "^2.11.0" +feedparser = "^6.0.11" [tool.poetry.dev-dependencies]