Skip to content

Commit

Permalink
Update batch size for requests
Browse files Browse the repository at this point in the history
  • Loading branch information
AndPuQing committed Jan 20, 2024
1 parent eb02e98 commit a91a807
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 30 deletions.
2 changes: 1 addition & 1 deletion backend/app/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class Settings(BaseSettings):
EMAIL_TEMPLATES_DIR: str = "bemore/email-templates/build"

# requests settings
REQUESTS_BATCH_SIZE: int = 10
REQUESTS_BATCH_SIZE: int = 2

model_config = SettingsConfigDict(
env_file=".env",
Expand Down
19 changes: 14 additions & 5 deletions backend/app/app/source/NIPS.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,25 @@
from scrapy.http import HtmlResponse

from app.source.base import PaperRequests
from app.source.base import PaperRequestsTask


class Nips(PaperRequests):
class Nips(PaperRequestsTask):
def __init__(self):
    """Initialise the source with the nips.cc 2023 poster schedule URL."""
    # The listing URL is fixed for this source, so it is handed straight to
    # the parent initialiser rather than being taken from the caller.
    super().__init__("https://nips.cc/Conferences/2023/Schedule?type=Poster")

@staticmethod
def get_urls(response: HtmlResponse):
return response.css("div::attr(onclick)").getall()
def get_urls(response: HtmlResponse) -> list[str]:
    """Build one absolute event URL per ``.maincard`` element in *response*.

    The ``id`` attribute of each card is read, its leading ``maincard_``
    marker is dropped, and the remainder is used as the ``showEvent`` query
    value of the schedule page.

    Args:
        response: Parsed listing page to read the card ids from.

    Returns:
        The event URLs, in the order the ids were selected.
    """
    event_url = "https://nips.cc/Conferences/2023/Schedule?showEvent="
    card_ids = response.css(".maincard::attr(id)").getall()
    # Strip only the leading marker: ``str.replace`` would also delete any
    # later occurrence of "maincard_" inside the id.
    return [event_url + card_id.removeprefix("maincard_") for card_id in card_ids]

@staticmethod
def parse(response: HtmlResponse):
yield {
return {
"type": response.css("div.maincardType::text").get(),
"title": response.css("div.maincardBody::text").get(),
"authors": response.css("div.maincardFooter::text").get(),
Expand Down
36 changes: 16 additions & 20 deletions backend/app/app/source/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,24 @@
from abc import abstractmethod

import requests
from celery import Task
from scrapy.http import HtmlResponse

from app.core.celery_app import celery_app
from app.core.config import settings


class PaperRequests:
class PaperRequestsTask(Task):
def __init__(self, url: str) -> None:
    """Store *url* on the instance as ``self.url``."""
    self.url = url

@abstractmethod
def get_urls(self, response: HtmlResponse) -> list:
@staticmethod
def get_urls(response: HtmlResponse) -> list[str]:
    """Return the list of absolute URLs found in *response*.

    Subclasses are expected to override this; the base implementation
    only raises ``NotImplementedError``.
    """
    # NOTE(review): if ``@abstractmethod`` is meant to apply to this method
    # as well, it has to sit below ``@staticmethod`` (innermost) — confirm
    # the decorator order in the full file.
    raise NotImplementedError()

@staticmethod
@abstractmethod
@celery_app.task(acks_late=True)
def parse(self, response: HtmlResponse) -> None:
def parse(response: HtmlResponse) -> dict[str, str]:
# Subclasses must return a dict of the fields scraped for one paper, e.g. "type", "title" and "authors".
raise NotImplementedError

@staticmethod
Expand All @@ -31,16 +32,11 @@ def request(url: str) -> HtmlResponse | None:
return
return HtmlResponse(url=url, body=response.content, encoding="utf-8")

@staticmethod
def batch_urls(urls: list, batch_size: int = settings.REQUESTS_BATCH_SIZE):
for i in range(0, len(urls), batch_size):
yield urls[i : i + batch_size]

def run(self):
self.response = self.request(self.url)
if self.response is None:
return
urls = self.get_urls(self.response)
logging.info(f"From {self.url} found {len(urls)} urls")
for batch in self.batch_urls(urls):
self.parse.delay(batch)
def run(self, urls: list[str]):
    """Fetch every URL in *urls* and collect the parsed results.

    Args:
        urls: Page URLs to download and parse.

    Returns:
        A list with one ``self.parse(...)`` result per URL that could be
        fetched; URLs whose request yields ``None`` are skipped.
    """
    # NOTE(review): ``request`` is looked up on ``PaperRequestsTask`` itself,
    # so a subclass override of ``request`` is not used here. Celery's
    # ``Task`` also has a ``request`` attribute — confirm the name clash is
    # intended.
    # ``map`` is lazy, so pages are still fetched and parsed one at a time.
    return [
        self.parse(page)
        for page in map(PaperRequestsTask.request, urls)
        if page is not None
    ]
12 changes: 8 additions & 4 deletions backend/app/app/worker.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import logging

from app.core.celery_app import celery_app
from app.source.NIPS import Nips


def run_paper_requests_task(source: str):
    """Stub that accepts *source* and currently does nothing.

    Args:
        source: Accepted but not used by the current implementation.

    Returns:
        Always ``None``.
    """
    return None


@celery_app.task(acks_late=True)
def test_celery_worker(word: str) -> None:
print(f"word: {word}")
Nips("https://nips.cc/Conferences/2023/Schedule?type=Poster").run()
print("done")
logging.info("Celery worker is working")
logging.info(f"DONE: {word}")

0 comments on commit a91a807

Please sign in to comment.