Merge pull request #203 from pittcsc/rewrite-news
Completely rewrite `news.py` and unit tests
tianyizheng02 authored Aug 25, 2024
2 parents 275fb92 + d0ed3e5 commit e1589e2
Showing 6 changed files with 6,257 additions and 91 deletions.
157 changes: 105 additions & 52 deletions pittapi/news.py
@@ -17,56 +17,109 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""

import math
import requests
import grequests
from typing import Any

sess = requests.session()


# Removed in this commit: the old grequests-based helper
def _load_n_items(feed: str, max_news_items: int):
    payload = {
        "feed": feed,
        "id": "",
        "_object": "kgoui_Rcontent_I0_Rcontent_I0",
        "start": 0,
    }

    # Fetch the feed in pages of 10 items, issuing the page requests in parallel
    request_objs = []
    for i in range(int(math.ceil(max_news_items / 10))):
        payload["start"] = i * 10
        request_objs.append(grequests.get("https://m.pitt.edu/news/index.json", params=payload))

    responses = grequests.imap(request_objs)

    return responses
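The paging arithmetic above splits a request for n items into ceil(n / 10) batches; a small illustration (hypothetical item count, not part of the module):

# E.g., 25 items -> ceil(25 / 10) == 3 requests, with start = 0, 10, 20
starts = [i * 10 for i in range(math.ceil(25 / 10))]  # [0, 10, 20]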
from __future__ import annotations


# Removed in this commit: the old get_news API
def get_news(feed: str = "main_news", max_news_items: int = 10) -> list[dict[str, Any]]:
    # feed indicates the desired news feed:
    #   'main_news'      - main news
    #   'cssd'           - student announcements, on My Pitt
    #   'news_chronicle' - the Pitt Chronicle news
    #   'news_alerts'    - crime alerts

    news = []

    resps = _load_n_items(feed, max_news_items)
    resps = [r.json()["response"]["regions"][0]["contents"] for r in resps]

    for resp in resps:
        for data in resp:
            fields = data["fields"]
            if fields["type"] == "loadMore":
                continue

            # TODO: Look into why this gives a TypeError during the news alert test.
            try:
                title = fields["title"]
                url = "https://m.pitt.edu" + fields["url"]["formatted"]
                news.append({"title": title, "url": url})
            except TypeError:
                continue

    return news[:max_news_items]
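For comparison with the new requests_html-based implementation that follows, the removed function was called like this (a sketch based on the signature above, not taken from the repo's docs):

# Hypothetical call to the removed API: the five most recent crime alerts
alerts = get_news(feed="news_alerts", max_news_items=5)
for item in alerts:
    print(item["title"], item["url"])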
import math
from requests_html import Element, HTMLResponse, HTMLSession
from typing import Literal, NamedTuple

NUM_ARTICLES_PER_PAGE = 20

NEWS_BY_CATEGORY_URL = (
    "https://www.pitt.edu/pittwire/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
    "&title={query}&field_category_target_id=All&page={page_num}"
)
PITT_BASE_URL = "https://www.pitt.edu"
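For illustration, expanding the template for the default category and the "university-news" topic (ID 432 per TOPIC_ID_MAP below), with the empty strings that _get_page_articles passes for an unset year, query, and page:

# Illustrative expansion of NEWS_BY_CATEGORY_URL, not part of the module
url = NEWS_BY_CATEGORY_URL.format(
    category="features-articles", topic_id=432, year="", query="", page_num=""
)
# https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432
#     &field_article_date_value=&title=&field_category_target_id=All&page=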

Category = Literal["features-articles", "accolades-honors", "ones-to-watch", "announcements-and-updates"]
Topic = Literal[
"university-news",
"health-and-wellness",
"technology-and-science",
"arts-and-humanities",
"community-impact",
"innovation-and-research",
"global",
"diversity-equity-and-inclusion",
"our-city-our-campus",
"teaching-and-learning",
"space",
"ukraine",
"sustainability",
]

TOPIC_ID_MAP: dict[Topic, int] = {
"university-news": 432,
"health-and-wellness": 2,
"technology-and-science": 391,
"arts-and-humanities": 4,
"community-impact": 6,
"innovation-and-research": 1,
"global": 9,
"diversity-equity-and-inclusion": 8,
"our-city-our-campus": 12,
"teaching-and-learning": 7,
"space": 440,
"ukraine": 441,
"sustainability": 470,
}

sess = HTMLSession()


class Article(NamedTuple):
    title: str
    description: str
    url: str
    tags: list[str]

    @classmethod
    def from_html(cls, article_html: Element) -> Article:
        # Pull the headline link, teaser paragraph, and tag list out of a single news-card element
        article_heading: Element = article_html.find("h2.news-card-title a", first=True)
        article_subheading: Element = article_html.find("p", first=True)
        article_tags_list: list[Element] = article_html.find("ul.news-card-tags li")

        article_title = article_heading.text.strip()
        article_url = PITT_BASE_URL + article_heading.attrs["href"]
        article_description = article_subheading.text.strip()
        article_tags = [tag.text.strip() for tag in article_tags_list]

        return cls(title=article_title, description=article_description, url=article_url, tags=article_tags)
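As a rough sketch, from_html expects a fragment shaped like the following; the markup is inferred from the selectors above, not copied from Pittwire:

# Hypothetical news-card fragment matching the selectors used in from_html
from requests_html import HTML

sample = HTML(html=(
    '<div class="news-card">'
    '<h2 class="news-card-title"><a href="/pittwire/news/example">Example headline</a></h2>'
    '<p>One-sentence teaser for the story.</p>'
    '<ul class="news-card-tags"><li>University News</li></ul>'
    '</div>'
))
card = sample.find("div.news-card", first=True)
print(Article.from_html(card))
# Article(title='Example headline', description='One-sentence teaser for the story.',
#         url='https://www.pitt.edu/pittwire/news/example', tags=['University News'])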


def _get_page_articles(
    topic: Topic,
    category: Category,
    query: str,
    year: int | None,
    page_num: int,
) -> list[Article]:
    # An unset year or page 0 becomes an empty query parameter, which the site treats as the default
    year_str = str(year) if year else ""
    page_num_str = str(page_num) if page_num else ""
    response: HTMLResponse = sess.get(
        NEWS_BY_CATEGORY_URL.format(
            category=category, topic_id=TOPIC_ID_MAP[topic], year=year_str, query=query, page_num=page_num_str
        )
    )
    main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
    news_cards: list[Element] = main_content.find("div.news-card")
    page_articles = [Article.from_html(news_card) for news_card in news_cards]
    return page_articles


def get_articles_by_topic(
    topic: Topic,
    category: Category = "features-articles",
    query: str = "",
    year: int | None = None,
    max_num_results: int = NUM_ARTICLES_PER_PAGE,
) -> list[Article]:
    num_pages = math.ceil(max_num_results / NUM_ARTICLES_PER_PAGE)

    # Get articles sequentially and synchronously (i.e., not using grequests) because the news pages must stay in order
    results: list[Article] = []
    for page_num in range(num_pages):  # Page numbers in the URL are 0-indexed
        page_articles = _get_page_articles(topic, category, query, year, page_num)
        num_articles_to_add = min(len(page_articles), max_num_results - len(results))
        results.extend(page_articles[:num_articles_to_add])
    return results
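A usage sketch of the new entry point (live network access and the current Pittwire markup assumed):

# Fetch up to 30 "university-news" articles; this walks pages 0 and 1 (20 cards per page)
articles = get_articles_by_topic("university-news", max_num_results=30)
for article in articles:
    print(article.title, article.url)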