Add ICLR and ICML sources, and feedparser dependency
AndPuQing committed Jan 27, 2024
1 parent b361ce6 commit 8e26e4c
Showing 10 changed files with 161 additions and 17 deletions.
4 changes: 4 additions & 0 deletions backend/app/app/models.py
@@ -88,6 +88,10 @@ class Item(ItemBase, table=True):
sa_column=Column(JSON),
)
from_source: str = Field(nullable=False)
category: Union[list[str], None] = Field(
default=None,
sa_column=Column(JSON),
)


# Properties to return via API, id is always required
64 changes: 64 additions & 0 deletions backend/app/app/source/Arxiv.py
@@ -0,0 +1,64 @@
import logging
import re
from typing import Any

from scrapy.http import HtmlResponse

from app.source.base import PaperRequestsTask, RSSTask


class Arxiv(RSSTask):
url: str = "http://export.arxiv.org/rss/cs"
name: str = "Arxiv"
_cache_category_map: dict[str, str] = {}

@staticmethod
def parse(entry: dict) -> dict[str, Any]:
return {
"title": entry["title"],
"authors": entry["author"],
"url": entry["link"],
"abstract": entry["summary"],
}

@property
def category_map(self):
if not self._cache_category_map:
response = PaperRequestsTask._request(
"https://arxiv.org/category_taxonomy",
)
if response is None:
return {}
response = HtmlResponse(
url="",
body=response.text,
encoding="utf-8",
)
category = response.css("h4::text").getall()
full_name = response.css("span::text").getall()
for i, c in enumerate(category):
                self._cache_category_map[c.strip()] = (
full_name[i].replace("(", "").replace(")", "")
)

return self._cache_category_map

def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]:
entry["title"] = entry["title"].split("(", 1)[0]
entry["authors"] = (
HtmlResponse(url="", body=entry["authors"], encoding="utf-8")
.css("a::text")
.getall()
)
entry["abstract"] = (
HtmlResponse(url="", body=entry["abstract"], encoding="utf-8")
.css("p::text")
.get()
)
        category = match[0] if match else None
        if category in self.category_map:
            entry["category"] = self.category_map[category]
        else:
            logging.warning(f"Unknown category: {category}")
            entry["category"] = None
return entry
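
For reference, a minimal standalone sketch of the title handling in post_parse, run against a made-up entry in the shape feedparser produces for this feed (all field values are assumptions, not real arXiv data):

import re

# Hypothetical entry resembling what feedparser yields for the arXiv RSS feed.
entry = {
    "title": "An Example Paper (arXiv:2401.00001v1 [cs.CL])",
    "author": '<a href="https://arxiv.org/find">Jane Doe</a>',
    "link": "https://arxiv.org/abs/2401.00001",
    "summary": "<p>We study an example problem.</p>",
}

# Same extraction as Arxiv.post_parse: grab "cs.CL" from the bracketed
# suffix, then drop the "(arXiv:...)" tail from the title.
category = re.findall(r"\[(.*?)\]", entry["title"])[0]
title = entry["title"].split("(", 1)[0].strip()
print(category)  # cs.CL
print(title)     # An Example Paper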
6 changes: 6 additions & 0 deletions backend/app/app/source/ICLR.py
@@ -0,0 +1,6 @@
from app.source.NIPS import NIPS


class ICLR(NIPS):
url: str = "https://iclr.cc/Conferences/2023/Schedule?type=Poster"
name: str = "ICLR"
6 changes: 6 additions & 0 deletions backend/app/app/source/ICML.py
@@ -0,0 +1,6 @@
from app.source.NIPS import NIPS


class ICML(NIPS):
url: str = "https://icml.cc/Conferences/2023/Schedule?type=Poster"
name: str = "ICML"
13 changes: 3 additions & 10 deletions backend/app/app/source/NIPS.py
@@ -2,12 +2,12 @@

from scrapy.http import HtmlResponse

from app.source.base import PaperRequestsTask
from app.source.base import PaperRequestsTask, openreview_url


class Nips(PaperRequestsTask):
class NIPS(PaperRequestsTask):
url: str = "https://nips.cc/Conferences/2023/Schedule?type=Poster"
name: str = "Nips"
name: str = "NIPS"

@staticmethod
def parse_urls(response: HtmlResponse) -> list[str]:
@@ -41,10 +41,3 @@ def post_parse(item: dict[str, Any]) -> dict[str, Any]:
for i, author in enumerate(item["authors"]):
item["authors"][i] = author.strip()
return item


def openreview_url(urls):
for url in urls[::-1]:
if "openreview" in url:
return url
return urls[0] # if no openreview url, return the first url
6 changes: 4 additions & 2 deletions backend/app/app/source/__init__.py
@@ -1,4 +1,6 @@
from app.source.base import PaperRequestsTask
from app.source.NIPS import Nips
from app.source.ICLR import ICLR
from app.source.ICML import ICML
from app.source.NIPS import NIPS

__all__ = ["PaperRequestsTask", "Nips"]
__all__ = ["PaperRequestsTask", "NIPS", "ICLR", "ICML"]
44 changes: 44 additions & 0 deletions backend/app/app/source/base.py
@@ -1,14 +1,23 @@
import logging
from typing import Any

import feedparser
import requests
from celery import Task
from feedparser import FeedParserDict
from scrapy.http import HtmlResponse
from sqlmodel import Session

from app.models import CrawledItem, Item


def openreview_url(urls):
for url in urls[::-1]:
if "openreview" in url:
return url
return urls[0] # if no openreview url, return the first url


class PaperRequestsTask(Task):
url: str
ignore_result: bool = True
@@ -91,3 +100,38 @@ def run(self, urls: list[str]):
results.append((url, item))

self.save(results)


class RSSTask(Task):
name: str
url: str
ignore_result: bool = True

@property
def db(self):
"""
Lazy loading of database connection.
"""
from app.db.engine import engine

return Session(engine)

@staticmethod
def parse(entry) -> dict[str, Any]:
raise NotImplementedError

def post_parse(self, entry: dict[str, Any]) -> dict[str, Any]:
return entry

def run(self):
feed: FeedParserDict = feedparser.parse(self.url)
results = []
for entry in feed.entries:
item = self.parse(entry)
item = self.post_parse(item)

if item["title"] is None or item["abstract"] is None:
logging.warning(f"Empty title or abstract: {entry.link}")
continue

results.append((entry.link, item))
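
A quick illustration of the relocated openreview_url helper's behavior (the URLs are made up):

urls = [
    "https://nips.cc/virtual/2023/poster/12345",
    "https://openreview.net/forum?id=abc123",
]
print(openreview_url(urls))  # prefers the OpenReview link
print(openreview_url(["https://example.org/a", "https://example.org/b"]))  # no OpenReview link: falls back to the first URL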
8 changes: 4 additions & 4 deletions backend/app/app/worker.py
@@ -7,9 +7,9 @@
from app.core.celery_app import celery_app
from app.core.config import settings
from app.models import CrawledItem
from app.source import Nips
from app.source import NIPS

celery_app.register_task(Nips())
celery_app.register_task(NIPS())


def batch(iterable: Union[set[str], list[str]], n: int = 1):
@@ -43,7 +43,7 @@ def db(self) -> Session:
ignore_result=True,
)
def test_celery_worker(self: DatabaseTask, word: str) -> None:
urls = set(Nips.get_urls())
urls = set(NIPS.get_urls())

# remove duplicates from db
with self.db as db:
@@ -57,4 +57,4 @@ def test_celery_worker(self: DatabaseTask, word: str) -> None:
logging.info(f"Cache hit rate: {cache_hit_rate * 100:.2f}%")

for url in batch(urls, settings.REQUESTS_BATCH_SIZE):
celery_app.send_task("Nips", kwargs={"urls": url})
celery_app.send_task("NIPS", kwargs={"urls": url})
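
If the new ICLR and ICML sources are wired up the same way as NIPS above, their registration and dispatch would presumably look like this (task names follow each class's name attribute; the poster URL is made up):

from app.core.celery_app import celery_app
from app.source import ICLR, ICML

# Assumption: ICLR/ICML are registered like NIPS; this commit only registers NIPS.
celery_app.register_task(ICLR())
celery_app.register_task(ICML())

# Tasks are addressed by their registered name, as with "NIPS" above.
celery_app.send_task("ICLR", kwargs={"urls": ["https://iclr.cc/virtual/2023/poster/11111"]})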
26 changes: 25 additions & 1 deletion backend/app/poetry.lock


1 change: 1 addition & 0 deletions backend/app/pyproject.toml
@@ -29,6 +29,7 @@ celery = "^5.3.6"
toml = "^0.10.2"
psycopg = { extras = ["binary"], version = "^3.1.13" }
scrapy = "^2.11.0"
feedparser = "^6.0.11"


[tool.poetry.dev-dependencies]
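
The feedparser dependency added here backs RSSTask.run above; a minimal sketch of the API surface it relies on, pointed at the same arXiv feed the Arxiv source uses:

import feedparser

feed = feedparser.parse("http://export.arxiv.org/rss/cs")

for entry in feed.entries[:3]:
    # Each entry is a FeedParserDict exposing the keys the sources read:
    # title, author, link, summary.
    print(entry.title, entry.link)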
