-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[TagesspiegelBridge] Add bridge for tagesspiegel.de (#4270)
* [TagesspiegelBridge] Add bridge for tagesspiegel.de * [TagesspiegelBridge] Raise timeout to 60min
- Loading branch information
1 parent
082542d
commit 29d984c
Showing
1 changed file
with
221 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
<?php

/**
 * Feed bridge for tagesspiegel.de.
 *
 * Reads one of the site's "contentexport" feeds (selected through the
 * `category` parameter) and, for each feed item, downloads the linked article
 * page to replace the feed content with the article's own markup and to
 * collect categories from the page.
 */
class TagesspiegelBridge extends FeedExpander
{
    const MAINTAINER = 'AlexanderS';
    const NAME = 'Tagesspiegel Bridge';
    const URI = 'https://www.tagesspiegel.de/';
    const CACHE_TIMEOUT = 3600; // 60min
    const DESCRIPTION = 'Returns the full articles instead of only the intro';
    const PARAMETERS = [[
        'category' => [
            'name' => 'Category',
            'type' => 'list',
            'values' => [
                'Startseite'
                    => 'https://tagesspiegel.de/contentexport/feed',
                'Plus'
                    => 'https://tagesspiegel.de/contentexport/feed/plus/',
                'Politik'
                    => 'https://tagesspiegel.de/contentexport/feed/politik/',
                'Internationales'
                    => 'https://tagesspiegel.de/contentexport/feed/internationales/',
                'Berlin'
                    => 'https://tagesspiegel.de/contentexport/feed/berlin/',
                'Berlin - Bezirke'
                    => 'https://tagesspiegel.de/contentexport/feed/berlin/bezirke/',
                'Berlin - Berliner Wirtschaft'
                    => 'https://tagesspiegel.de/contentexport/feed/berlin/berliner-wirtschaft/',
                'Berlin - Berliner Sport'
                    => 'https://tagesspiegel.de/contentexport/feed/berlin/berliner_sport/',
                'Berlin - Polizei & Justiz'
                    => 'https://tagesspiegel.de/contentexport/feed/berlin/polizei-justiz/',
                'Berlin - Stadtleben'
                    => 'https://tagesspiegel.de/contentexport/feed/berlin/stadtleben/',
                'Berlin - Schule'
                    => 'https://tagesspiegel.de/contentexport/feed/berlin/schule/',
                'Gesellschaft'
                    => 'https://tagesspiegel.de/contentexport/feed/gesellschaft/',
                'Gesellschaft - Liebe & Partnerschaft'
                    => 'https://tagesspiegel.de/contentexport/feed/gesellschaft/liebe-partnerschaft/',
                'Gesellschaft - Queer'
                    => 'https://tagesspiegel.de/contentexport/feed/gesellschaft/queerspiegel/',
                'Gesellschaft - Panorama'
                    => 'https://tagesspiegel.de/contentexport/feed/gesellschaft/panorama/',
                'Gesellschaft - Medien'
                    => 'https://tagesspiegel.de/contentexport/feed/gesellschaft/medien/',
                'Gesellschaft - Geschichte'
                    => 'https://tagesspiegel.de/contentexport/feed/gesellschaft/geschichte/',
                'Gesellschaft - Reise'
                    => 'https://tagesspiegel.de/contentexport/feed/gesellschaft/reise/',
                'Wirtschaft'
                    => 'https://tagesspiegel.de/contentexport/feed/wirtschaft/',
                'Wirtschaft - Immobilien'
                    => 'https://tagesspiegel.de/contentexport/feed/wirtschaft/immobilien/',
                'Wirtschaft - Jobs & Karriere'
                    => 'https://tagesspiegel.de/contentexport/feed/wirtschaft/karriere/',
                'Wirtschaft - Finanzen'
                    => 'https://tagesspiegel.de/contentexport/feed/wirtschaft/finanzen/',
                'Wirtschaft - Mobilität'
                    => 'https://tagesspiegel.de/contentexport/feed/wirtschaft/mobilitaet/',
                'Kultur'
                    => 'https://tagesspiegel.de/contentexport/feed/kultur/',
                'Kultur - Literatur'
                    => 'https://tagesspiegel.de/contentexport/feed/kultur/literatur/',
                'Kultur - Comics'
                    => 'https://tagesspiegel.de/contentexport/feed/kultur/comics/',
                'Kultur - Kino'
                    => 'https://tagesspiegel.de/contentexport/feed/kultur/kino/',
                'Kultur - Pop'
                    => 'https://tagesspiegel.de/contentexport/feed/kultur/pop/',
                'Kultur - Ausstellungen'
                    => 'https://tagesspiegel.de/contentexport/feed/kultur/ausstellungen/',
                'Kultur - Bühne'
                    => 'https://tagesspiegel.de/contentexport/feed/kultur/buehne/',
                'Wissen'
                    => 'https://tagesspiegel.de/contentexport/feed/wissen/',
                'Gesundheit'
                    => 'https://tagesspiegel.de/contentexport/feed/gesundheit/',
                'Sport'
                    => 'https://tagesspiegel.de/contentexport/feed/sport/',
                'Meinung'
                    => 'https://tagesspiegel.de/contentexport/feed/meinung/',
                'Meinung - Kolumnen'
                    => 'https://tagesspiegel.de/contentexport/feed/meinung/kolumnen/',
                'Meinung - Lesermeinung'
                    => 'https://tagesspiegel.de/contentexport/feed/meinung/lesermeinung/',
                'Potsdam'
                    => 'https://tagesspiegel.de/contentexport/feed/potsdam/',
                'Potsdam - Landeshauptstadt'
                    => 'https://tagesspiegel.de/contentexport/feed/potsdam/landeshauptstadt/',
                'Potsdam - Potsdam-Mittelmark'
                    => 'https://tagesspiegel.de/contentexport/feed/potsdam/potsdam-mittelmark/',
                'Potsdam - Brandenburg'
                    => 'https://tagesspiegel.de/contentexport/feed/potsdam/brandenburg/',
                'Potsdam - Kultur'
                    => 'https://tagesspiegel.de/contentexport/feed/potsdam/potsdam-kultur/',
                'Podcasts'
                    => 'https://tagesspiegel.de/contentexport/feed/podcasts/',
            ]
        ],
        'limit' => [
            'name' => 'Limit',
            'type' => 'number',
            'required' => false,
            'title' => 'Specify number of full articles to return',
            'defaultValue' => 5
        ]
    ]];

    /**
     * Expand the selected category feed, limited to the requested item count.
     *
     * A missing or falsy `limit` input (including 0) falls back to 5.
     */
    public function collectData()
    {
        $url = $this->getInput('category');
        $limit = $this->getInput('limit') ?: 5;

        $this->collectExpandableDatas($url, $limit);
    }

    /**
     * Enrich a single feed item with data scraped from its article page.
     *
     * @param array $item Feed item; its 'uri' entry is the article URL to load.
     * @return array The item with 'enclosures' reset and, when the page could
     *               be parsed, 'categories' and 'content' filled in.
     */
    protected function parseItem(array $item)
    {
        $item['enclosures'] = [];

        $article = getSimpleHTMLDOM($item['uri']);
        if (!$article) {
            // Nothing parseable came back; keep the item as delivered by the
            // feed instead of calling methods on a non-object.
            return $item;
        }

        return $this->parseArticle($item, $article);
    }

    /**
     * Fill 'categories' and 'content' of an item from the article DOM.
     *
     * @param array $item    Feed item to enrich.
     * @param mixed $article Parsed article page (simple_html_dom document).
     * @return array The enriched item. When the page has no <article> element
     *               or cannot be re-parsed, 'content' is left untouched.
     */
    private function parseArticle($item, $article)
    {
        $item['categories'] = $this->collectCategories($article);

        $body = $article->find('article', 0);
        if (!$body) {
            return $item;
        }

        // Remove known bad elements
        $badSelector
            = 'script, aside, nav, dl.debug-piano, .link--external svg, time, a[data-gtm-class="article-home-link"]';
        foreach ($body->find($badSelector) as $bad) {
            $bad->remove();
        }

        // Remove references to external content (requires javascript for consent)
        foreach ($body->find('p') as $par) {
            if ($par->plaintext !== 'Empfohlener redaktioneller Inhalt') {
                continue;
            }

            // The wrapper to drop sits four levels above the marker paragraph.
            // Walk up defensively so a shallower tree does not cause a fatal error.
            $container = $par;
            for ($level = 0; $level < 4 && $container; $level++) {
                $container = $container->parent;
            }
            if ($container) {
                $container->remove();
            }
        }

        // Reload html, as remove() is buggy
        $body = str_get_html($body->outertext);
        if (!$body) {
            return $item;
        }

        // Clean article content
        $elements = $body->find('h3, p, figure, blockquote');
        foreach ($elements as $i => $element) {
            foreach ($element->find('img, picture source') as $img) {
                $this->absolutizeImageUrls($img);
            }

            // Remove paragraphs that are already included in other elements
            if ($element->tag === 'p') {
                $parent = $element->parent;
                if ($parent && in_array($parent->tag, ['blockquote', 'figure'], true)) {
                    unset($elements[$i]);
                }
            }
        }
        $item['content'] = implode('', $elements);

        return $item;
    }

    /**
     * Gather category labels from the article page.
     *
     * Sources, in order: a "Tagesspiegel Plus" marker, the breadcrumb trail
     * (without its first and last entry) and the tag list that follows the
     * article's home link.
     *
     * @param mixed $article Parsed article page (simple_html_dom document).
     * @return array List of category strings; empty when nothing was found.
     */
    private function collectCategories($article)
    {
        $categories = [];

        // Add tag for articles only available with "Tagesspiegel Plus"
        if ($article->find('span[data-ob="plus"]', 0)) {
            $categories[] = 'Tagesspiegel Plus';
        }

        // Add section from breadcrumbs as tags. Pages without a breadcrumb
        // list are simply skipped.
        $breadcrumbs = $article->find('ol[property="breadcrumb"]', 0);
        if ($breadcrumbs) {
            // First and last entries are dropped — presumably the home link
            // and the article itself; verify against the live markup.
            $names = array_slice($breadcrumbs->find('span[property="name"]'), 1, -1);
            foreach ($names as $name) {
                $categories[] = trim($name->plaintext);
            }
        }

        // Get categories from article
        $homeLink = $article->find('a[data-gtm-class="article-home-link"]', 0);
        if ($homeLink && $homeLink->parent) {
            $tagContainer = $homeLink->parent->nextSibling();
            if ($tagContainer) {
                foreach ($tagContainer->find('li') as $tag) {
                    $categories[] = trim($tag->plaintext);
                }
            }
        }

        return $categories;
    }

    /**
     * Rewrite root-relative 'src' and 'srcset' URLs of an element against self::URI.
     *
     * Only URLs starting with '/' are touched; everything else is kept as is.
     *
     * @param mixed $img An <img> or <source> node; modified in place.
     * @return void
     */
    private function absolutizeImageUrls($img)
    {
        // Add URI to src
        if ($img->hasAttribute('src') && str_starts_with($img->attr['src'], '/')) {
            $img->attr['src'] = urljoin(self::URI, $img->attr['src']);
        }

        // Add URI to srcset
        if ($img->hasAttribute('srcset')) {
            $candidates = [];
            foreach (explode(',', $img->attr['srcset']) as $candidate) {
                // Each candidate is "<url> [descriptor]"; only the URL part is rewritten.
                $parts = explode(' ', trim($candidate));
                if (str_starts_with($parts[0], '/')) {
                    $parts[0] = urljoin(self::URI, $parts[0]);
                }
                $candidates[] = implode(' ', $parts);
            }
            $img->attr['srcset'] = implode(', ', $candidates);
        }
    }
}