Commit
Completely rewrite news.py and unit tests
Rewrite the news module because the underlying news API has been dead
for years. The rewritten news.py scrapes the Pittwire website for
official Pitt news articles. The rewritten unit tests for news.py mock
API requests and achieve 100% code coverage with pytest.
tianyizheng02 committed Aug 20, 2024
1 parent 6d6b1ef commit d0ed3e5
Showing 6 changed files with 6,257 additions and 91 deletions.
157 changes: 105 additions & 52 deletions pittapi/news.py
@@ -17,56 +17,109 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""

-import math
-import requests
-import grequests
-from typing import Any
-
-sess = requests.session()
-
-
-def _load_n_items(feed: str, max_news_items: int):
-    payload = {
-        "feed": feed,
-        "id": "",
-        "_object": "kgoui_Rcontent_I0_Rcontent_I0",
-        "start": 0,
-    }
-
-    request_objs = []
-    for i in range(int(math.ceil(max_news_items / 10))):
-        payload["start"] = i * 10
-        request_objs.append(grequests.get("https://m.pitt.edu/news/index.json", params=payload))
-
-    responses = grequests.imap(request_objs)
-
-    return responses
+from __future__ import annotations


-def get_news(feed: str = "main_news", max_news_items: int = 10) -> list[dict[str, Any]]:
-    # feed indicates the desired news feed
-    # 'main_news' - main news
-    # 'cssd' - student announcements, on My Pitt
-    # 'news_chronicle' - the Pitt Chronicle news
-    # 'news_alerts' - crime alerts
-
-    news = []
-
-    resps = _load_n_items(feed, max_news_items)
-    resps = [r.json()["response"]["regions"][0]["contents"] for r in resps]
-
-    for resp in resps:
-        for data in resp:
-            fields = data["fields"]
-            if fields["type"] == "loadMore":
-                continue
-
-            # TODO: Look into why this gives a TypeError during the news alert test.
-            try:
-                title = fields["title"]
-                url = "https://m.pitt.edu" + fields["url"]["formatted"]
-                news.append({"title": title, "url": url})
-            except TypeError:
-                continue
-
-    return news[:max_news_items]
+import math
+from requests_html import Element, HTMLResponse, HTMLSession
+from typing import Literal, NamedTuple
+
+NUM_ARTICLES_PER_PAGE = 20
+
+NEWS_BY_CATEGORY_URL = (
+    "https://www.pitt.edu/pittwire/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
+    "&title={query}&field_category_target_id=All&page={page_num}"
+)
+PITT_BASE_URL = "https://www.pitt.edu"
+
+Category = Literal["features-articles", "accolades-honors", "ones-to-watch", "announcements-and-updates"]
+Topic = Literal[
+    "university-news",
+    "health-and-wellness",
+    "technology-and-science",
+    "arts-and-humanities",
+    "community-impact",
+    "innovation-and-research",
+    "global",
+    "diversity-equity-and-inclusion",
+    "our-city-our-campus",
+    "teaching-and-learning",
+    "space",
+    "ukraine",
+    "sustainability",
+]
+
+TOPIC_ID_MAP: dict[Topic, int] = {
+    "university-news": 432,
+    "health-and-wellness": 2,
+    "technology-and-science": 391,
+    "arts-and-humanities": 4,
+    "community-impact": 6,
+    "innovation-and-research": 1,
+    "global": 9,
+    "diversity-equity-and-inclusion": 8,
+    "our-city-our-campus": 12,
+    "teaching-and-learning": 7,
+    "space": 440,
+    "ukraine": 441,
+    "sustainability": 470,
+}
+
+sess = HTMLSession()
+
+
+class Article(NamedTuple):
+    title: str
+    description: str
+    url: str
+    tags: list[str]
+
+    @classmethod
+    def from_html(cls, article_html: Element) -> Article:
+        article_heading: Element = article_html.find("h2.news-card-title a", first=True)
+        article_subheading: Element = article_html.find("p", first=True)
+        article_tags_list: list[Element] = article_html.find("ul.news-card-tags li")
+
+        article_title = article_heading.text.strip()
+        article_url = PITT_BASE_URL + article_heading.attrs["href"]
+        article_description = article_subheading.text.strip()
+        article_tags = [tag.text.strip() for tag in article_tags_list]
+
+        return cls(title=article_title, description=article_description, url=article_url, tags=article_tags)
+
+
+def _get_page_articles(
+    topic: Topic,
+    category: Category,
+    query: str,
+    year: int | None,
+    page_num: int,
+) -> list[Article]:
+    year_str = str(year) if year else ""
+    page_num_str = str(page_num) if page_num else ""
+    response: HTMLResponse = sess.get(
+        NEWS_BY_CATEGORY_URL.format(
+            category=category, topic_id=TOPIC_ID_MAP[topic], year=year_str, query=query, page_num=page_num_str
+        )
+    )
+    main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
+    news_cards: list[Element] = main_content.find("div.news-card")
+    page_articles = [Article.from_html(news_card) for news_card in news_cards]
+    return page_articles
+
+
+def get_articles_by_topic(
+    topic: Topic,
+    category: Category = "features-articles",
+    query: str = "",
+    year: int | None = None,
+    max_num_results: int = NUM_ARTICLES_PER_PAGE,
+) -> list[Article]:
+    num_pages = math.ceil(max_num_results / NUM_ARTICLES_PER_PAGE)
+
+    # Get articles sequentially and synchronously (i.e., not using grequests) because the news pages must stay in order
+    results: list[Article] = []
+    for page_num in range(num_pages):  # Page numbers in url are 0-indexed
+        page_articles = _get_page_articles(topic, category, query, year, page_num)
+        num_articles_to_add = min(len(page_articles), max_num_results - len(results))
+        results.extend(page_articles[:num_articles_to_add])
+    return results
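
For reference, a minimal usage sketch of the rewritten module, assuming it is importable as pittapi.news (the topic and result count below are arbitrary examples, not from the commit):

from pittapi import news

# Request up to 30 featured articles on the "technology-and-science" topic.
# With 20 articles per page, this fetches two Pittwire pages in order.
articles = news.get_articles_by_topic("technology-and-science", max_num_results=30)

for article in articles:
    # Each result is an Article NamedTuple: title, description, url, tags
    print(f"{article.title} ({article.url})")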
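The rewritten unit tests are not shown in this excerpt. As a rough sketch of the mocking approach the commit message describes (patching the module-level session so no real HTTP request is made), one such test might look like the following; the fixture HTML and test name are hypothetical, not the actual test code:

from types import SimpleNamespace
from unittest.mock import patch

from requests_html import HTML

from pittapi import news

# Hypothetical one-card Pittwire page matching the XPath and CSS selectors in news.py
FAKE_PAGE = """
<html><body><div><main><div><section>
  <div class="news-card">
    <h2 class="news-card-title"><a href="/pittwire/news/example">Example headline</a></h2>
    <p>Example description.</p>
    <ul class="news-card-tags"><li>University News</li></ul>
  </div>
</section></div></main></div></body></html>
"""


def test_get_articles_by_topic_parses_news_cards():
    # Stub out the module-level session's get() so no request leaves the test
    fake_response = SimpleNamespace(html=HTML(html=FAKE_PAGE))
    with patch.object(news.sess, "get", return_value=fake_response):
        articles = news.get_articles_by_topic("university-news")
    assert articles == [
        news.Article(
            title="Example headline",
            description="Example description.",
            url="https://www.pitt.edu/pittwire/news/example",
            tags=["University News"],
        )
    ]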