From b81249383f0ff54fb41bde502e0f6c8627c81fa9 Mon Sep 17 00:00:00 2001
From: Tughan Belbek
Date: Fri, 25 Oct 2024 13:34:57 +0200
Subject: [PATCH] Add Duvar.org bridge for scraping news articles

---
 bridges/DuvarOrgBridge.php | 87 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 bridges/DuvarOrgBridge.php

diff --git a/bridges/DuvarOrgBridge.php b/bridges/DuvarOrgBridge.php
new file mode 100644
index 00000000000..85f0500ddfa
--- /dev/null
+++ b/bridges/DuvarOrgBridge.php
@@ -0,0 +1,87 @@
+ [
+            'name' => 'Limit',
+            'type' => 'number',
+            'required' => true,
+            'title' => 'Maximum number of items to return',
+            'defaultValue' => 20,
+        ],
+        'urlsuffix' => [
+            'name' => 'URL Suffix',
+            'type' => 'list',
+            'required' => true,
+            'title' => 'Suffix for the URL to scrape a specific section',
+            'defaultValue' => '',
+            'values' => [
+                'Main' => '',
+                'Balanced' => '/uyumlu',
+                'Protest' => '/muhalif',
+                'Center' => '/merkez',
+                'Alternative' => '/alternatif',
+                'Global' => '/global',
+            ],
+        ],
+    ]];
+
+    /**
+     * Scrape the selected section and append one entry per article to $this->items.
+     *
+     * Fetches self::URI concatenated with the 'urlsuffix' input, walks every
+     * `article.news-item` node and emits an item for each article that has a
+     * headline link. Collection stops as soon as the number of items reaches
+     * the 'postcount' input.
+     *
+     * Item fields:
+     *  - uri:       href of the headline link
+     *  - title:     trimmed headline text
+     *  - timestamp: parsed from the article's <time> node, current time as fallback
+     *  - content:   trimmed plain text of the description, '' when absent
+     *  - uid:       SHA-256 of the title
+     */
+    public function collectData()
+    {
+        $limit = $this->getInput('postcount');
+        $html = getSimpleHTMLDOM(self::URI . $this->getInput('urlsuffix'));
+
+        foreach ($html->find('article.news-item') as $article) {
+            try {
+                // One anchor supplies both the link and the title, so query it once.
+                $headline = $article->find('h2.news-title a', 0);
+                if (!$headline) {
+                    // Without a headline link there is neither a URI nor a title.
+                    continue;
+                }
+
+                $description = $article->find('div.news-description', 0);
+
+                $item = [];
+                // NOTE(review): the href is used verbatim; if the site emits relative
+                // links they would need resolving against self::URI - confirm.
+                $item['uri'] = $headline->getAttribute('href');
+                $item['title'] = trim($headline->plaintext);
+                $item['timestamp'] = $this->parseTimestamp($article->find('time.meta-tag.date-tag', 0));
+                $item['content'] = $description ? trim($description->plaintext) : '';
+                $item['uid'] = hash('sha256', $item['title']);
+
+                $this->items[] = $item;
+
+                if (count($this->items) >= $limit) {
+                    break;
+                }
+            } catch (Exception $e) {
+                // Best effort: skip an article that cannot be processed instead of
+                // failing the whole feed.
+                continue;
+            }
+        }
+    }
+
+    /**
+     * Convert an article's <time> node into a Unix timestamp.
+     *
+     * Tries the node's `datetime` attribute first and its visible text second.
+     * strtotime() returns false for text it cannot parse, so the current time
+     * is used when the node is missing or neither candidate parses; the item
+     * therefore never carries a boolean timestamp.
+     *
+     * @param object|null $timeElement Node returned by find(), or null when not found.
+     * @return int Unix timestamp.
+     */
+    private function parseTimestamp($timeElement)
+    {
+        if (!$timeElement) {
+            return time();
+        }
+
+        $candidates = [$timeElement->getAttribute('datetime'), $timeElement->plaintext];
+        foreach ($candidates as $candidate) {
+            // A missing attribute does not come back as a string; skip it.
+            if (!is_string($candidate) || trim($candidate) === '') {
+                continue;
+            }
+
+            $parsed = strtotime(trim($candidate));
+            if ($parsed !== false) {
+                return $parsed;
+            }
+        }
+
+        return time();
+    }
+}
\ No newline at end of file