Add ICLR and ICML sources, and feedparser dependency

AndPuQing · Jan 27, 2024 · 8e26e4c · 8e26e4c
1 parent b361ce6
commit 8e26e4c
Show file tree

Hide file tree

Showing 10 changed files with 161 additions and 17 deletions.
diff --git a/backend/app/app/models.py b/backend/app/app/models.py
@@ -88,6 +88,10 @@ class Item(ItemBase, table=True):
         sa_column=Column(JSON),
     )
     from_source: str = Field(nullable=False)
+    category: Union[list[str], None] = Field(
+        default=None,
+        sa_column=Column(JSON),
+    )
 
 
 # Properties to return via API, id is always required

diff --git a/backend/app/app/source/Arxiv.py b/backend/app/app/source/Arxiv.py
@@ -0,0 +1,64 @@
+import logging
+import re
+from typing import Any
+
+from scrapy.http import HtmlResponse
+
+from app.source.base import PaperRequestsTask, RSSTask
+
+
+class Arxiv(RSSTask):
+    """RSS source for the arXiv computer-science feed.
+
+    ``parse`` extracts the raw fields from a feed entry and ``post_parse``
+    cleans them up and attaches the human-readable category name.
+    """
+
+    url: str = "http://export.arxiv.org/rss/cs"
+    name: str = "Arxiv"
+    # Class-level cache shared by all instances: category code -> full name.
+    _cache_category_map: dict[str, str] = {}
+
+    @staticmethod
+    def parse(entry: dict) -> dict[str, Any]:
+        """Pick the fields of interest out of a single feed entry.
+
+        Raises:
+            KeyError: if the entry lacks one of the expected keys.
+        """
+        return {
+            "title": entry["title"],
+            "authors": entry["author"],
+            "url": entry["link"],
+            "abstract": entry["summary"],
+        }
+
+    @property
+    def category_map(self) -> dict[str, str]:
+        """Map arXiv category codes to their full names.
+
+        The mapping is scraped from the arXiv taxonomy page on first use and
+        stored in the class-level cache. When the request yields no response
+        an empty dict is returned and nothing is cached, so the next access
+        retries.
+        """
+        cache = self._cache_category_map
+        if not cache:
+            response = PaperRequestsTask._request(
+                "https://arxiv.org/category_taxonomy",
+            )
+            if response is None:
+                return {}
+            page = HtmlResponse(
+                url="",
+                body=response.text,
+                encoding="utf-8",
+            )
+            codes = page.css("h4::text").getall()
+            full_names = page.css("span::text").getall()
+            # NOTE(review): codes and names are paired purely by position;
+            # this assumes every <h4> text lines up with one <span> -- confirm
+            # against the live page. zip() stops at the shorter list instead
+            # of raising IndexError when the two lists differ in length.
+            for code, full_name in zip(codes, full_names):
+                cache[code.strip()] = (
+                    full_name.replace("(", "").replace(")", "").strip()
+                )
+
+        return cache
+
+    def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]:
+        """Normalise a parsed entry in place and return it.
+
+        * ``title`` is cut at the first ``"("``.
+        * ``authors`` becomes the list of ``<a>`` texts found in the author
+          markup.
+        * ``abstract`` becomes the text of the first ``<p>`` in the summary.
+        * ``category`` becomes a one-element list holding the full category
+          name, or ``None`` when no known category code is found.
+        """
+        # The category tag has to be read from the part of the title that is
+        # about to be discarded, so split first and search the remainder.
+        # Assumes the feed title looks like "Title. (arXiv:<id> [cs.XX])"
+        # -- TODO confirm against the live feed.
+        title, _, suffix = entry["title"].partition("(")
+        entry["title"] = title
+        entry["authors"] = (
+            HtmlResponse(url="", body=entry["authors"], encoding="utf-8")
+            .css("a::text")
+            .getall()
+        )
+        entry["abstract"] = (
+            HtmlResponse(url="", body=entry["abstract"], encoding="utf-8")
+            .css("p::text")
+            .get()
+        )
+
+        found = re.search(r"\[(.*?)\]", suffix)
+        category = found.group(1) if found else None
+        # Go through the instance: reading the property on the class would
+        # return the property object itself rather than the mapping.
+        full_name = (
+            self.category_map.get(category) if category is not None else None
+        )
+        if full_name is None:
+            logging.warning(f"Unknown category: {category}")
+            entry["category"] = None
+        else:
+            # Item.category is declared as a list of strings.
+            entry["category"] = [full_name]
+        return entry
diff --git a/backend/app/app/source/ICLR.py b/backend/app/app/source/ICLR.py
@@ -0,0 +1,6 @@
+# Import the class from its defining module. While the ``app.source`` package
+# is still initialising, ``from app.source import NIPS`` resolves to the
+# ``NIPS`` submodule instead of the class, and subclassing a module fails.
+from app.source.NIPS import NIPS
+
+
+class ICLR(NIPS):
+    """ICLR 2023 poster schedule source; only ``url`` and ``name`` differ from NIPS."""
+
+    url: str = "https://iclr.cc/Conferences/2023/Schedule?type=Poster"
+    name: str = "ICLR"
diff --git a/backend/app/app/source/ICML.py b/backend/app/app/source/ICML.py
@@ -0,0 +1,6 @@
+# Import the class from its defining module. While the ``app.source`` package
+# is still initialising, ``from app.source import NIPS`` resolves to the
+# ``NIPS`` submodule instead of the class, and subclassing a module fails.
+from app.source.NIPS import NIPS
+
+
+class ICML(NIPS):
+    """ICML 2023 poster schedule source; only ``url`` and ``name`` differ from NIPS."""
+
+    url: str = "https://icml.cc/Conferences/2023/Schedule?type=Poster"
+    name: str = "ICML"
diff --git a/backend/app/app/source/NIPS.py b/backend/app/app/source/NIPS.py
@@ -2,12 +2,12 @@
 
 from scrapy.http import HtmlResponse
 
-from app.source.base import PaperRequestsTask
+from app.source.base import PaperRequestsTask, openreview_url
 
 
-class Nips(PaperRequestsTask):
+class NIPS(PaperRequestsTask):
     url: str = "https://nips.cc/Conferences/2023/Schedule?type=Poster"
-    name: str = "Nips"
+    name: str = "NIPS"
 
     @staticmethod
     def parse_urls(response: HtmlResponse) -> list[str]:
@@ -41,10 +41,3 @@ def post_parse(item: dict[str, Any]) -> dict[str, Any]:
             for i, author in enumerate(item["authors"]):
                 item["authors"][i] = author.strip()
         return item
-
-
-def openreview_url(urls):
-    for url in urls[::-1]:
-        if "openreview" in url:
-            return url
-    return urls[0]  # if no openreview url, return the first url
diff --git a/backend/app/app/source/__init__.py b/backend/app/app/source/__init__.py
@@ -1,4 +1,6 @@
 from app.source.base import PaperRequestsTask
-from app.source.NIPS import Nips
+from app.source.ICLR import ICLR
+from app.source.ICML import ICML
+from app.source.NIPS import NIPS
 
-__all__ = ["PaperRequestsTask", "Nips"]
+__all__ = ["PaperRequestsTask", "NIPS", "ICLR", "ICML"]
diff --git a/backend/app/app/source/base.py b/backend/app/app/source/base.py
@@ -1,14 +1,23 @@
 import logging
 from typing import Any
 
+import feedparser
 import requests
 from celery import Task
+from feedparser import FeedParserDict
 from scrapy.http import HtmlResponse
 from sqlmodel import Session
 
 from app.models import CrawledItem, Item
 
 
+def openreview_url(urls):
+    """Choose the preferred link from *urls*.
+
+    The last entry containing ``"openreview"`` is returned; when no entry
+    matches, the first entry is returned instead.
+    """
+    preferred = next(
+        (link for link in reversed(urls) if "openreview" in link),
+        None,
+    )
+    if preferred is not None:
+        return preferred
+    return urls[0]  # no openreview link: fall back to the first url
+
+
 class PaperRequestsTask(Task):
     url: str
     ignore_result: bool = True
@@ -91,3 +100,38 @@ def run(self, urls: list[str]):
             results.append((url, item))
 
         self.save(results)
+
+
+class RSSTask(Task):
+    """Base Celery task for sources that are consumed through an RSS feed.
+
+    Subclasses set ``name`` and ``url`` and implement ``parse``; they may
+    override ``post_parse`` to clean up the parsed fields.
+    """
+
+    name: str
+    url: str
+    ignore_result: bool = True
+
+    @property
+    def db(self):
+        """
+        Lazy loading of database connection.
+
+        The engine is imported on access and a new ``Session`` is returned
+        each time the property is read.
+        """
+        from app.db.engine import engine
+
+        return Session(engine)
+
+    @staticmethod
+    def parse(entry) -> dict[str, Any]:
+        """Turn one feed entry into an item dict; subclasses must implement."""
+        raise NotImplementedError
+
+    def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]:
+        """Hook for cleaning up a parsed item; returns it unchanged by default."""
+        return entry
+
+    def run(self) -> list[tuple[str, dict[str, Any]]]:
+        """Fetch the feed at ``self.url`` and parse every entry.
+
+        Entries whose parsed ``title`` or ``abstract`` is ``None`` are logged
+        and skipped.
+
+        Returns:
+            ``(link, item)`` pairs for every entry that was kept. The list was
+            previously built and then dropped; returning it makes the outcome
+            of the task observable.
+        """
+        feed: FeedParserDict = feedparser.parse(self.url)
+        results = []
+        for entry in feed.entries:
+            item = self.post_parse(self.parse(entry))
+
+            if item["title"] is None or item["abstract"] is None:
+                logging.warning(f"Empty title or abstract: {entry.link}")
+                continue
+
+            results.append((entry.link, item))
+
+        # NOTE(review): nothing is persisted here -- this class defines no
+        # save step and ``db`` is not used; confirm whether storing the items
+        # is still to be added.
+        return results
diff --git a/backend/app/app/worker.py b/backend/app/app/worker.py
@@ -7,9 +7,9 @@
 from app.core.celery_app import celery_app
 from app.core.config import settings
 from app.models import CrawledItem
-from app.source import Nips
+from app.source import NIPS
 
-celery_app.register_task(Nips())
+celery_app.register_task(NIPS())
 
 
 def batch(iterable: Union[set[str], list[str]], n: int = 1):
@@ -43,7 +43,7 @@ def db(self) -> Session:
     ignore_result=True,
 )
 def test_celery_worker(self: DatabaseTask, word: str) -> None:
-    urls = set(Nips.get_urls())
+    urls = set(NIPS.get_urls())
 
     # remove duplicates from db
     with self.db as db:
@@ -57,4 +57,4 @@ def test_celery_worker(self: DatabaseTask, word: str) -> None:
     logging.info(f"Cache hit rate: {cache_hit_rate * 100:.2f}%")
 
     for url in batch(urls, settings.REQUESTS_BATCH_SIZE):
-        celery_app.send_task("Nips", kwargs={"urls": url})
+        celery_app.send_task("NIPS", kwargs={"urls": url})
diff --git a/backend/app/poetry.lock b/backend/app/poetry.lock
diff --git a/backend/app/pyproject.toml b/backend/app/pyproject.toml
@@ -29,6 +29,7 @@ celery = "^5.3.6"
 toml = "^0.10.2"
 psycopg = { extras = ["binary"], version = "^3.1.13" }
 scrapy = "^2.11.0"
+feedparser = "^6.0.11"
 
 
 [tool.poetry.dev-dependencies]