From fb66775ecee9db86c6be55c28eeae1c8f50e4cee Mon Sep 17 00:00:00 2001 From: Niehztog Date: Tue, 2 Apr 2024 23:14:25 +0200 Subject: [PATCH] [XPathAbstract] Refactor xpath abstract (#4047) * refactor XPathAbstract, keep all functionality intact * fix linter errors * further simplify code * set default value for raw item content to true, avoiding escaping of html tags in feed item contents by default --- bridges/BlizzardNewsBridge.php | 2 +- lib/XPathAbstract.php | 125 ++++++++++++--------------------- 2 files changed, 47 insertions(+), 80 deletions(-) diff --git a/bridges/BlizzardNewsBridge.php b/bridges/BlizzardNewsBridge.php index 19c38152cc0..993492d404b 100644 --- a/bridges/BlizzardNewsBridge.php +++ b/bridges/BlizzardNewsBridge.php @@ -37,7 +37,7 @@ class BlizzardNewsBridge extends XPathAbstract const XPATH_EXPRESSION_ITEM = '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article'; const XPATH_EXPRESSION_ITEM_TITLE = './/div/div[2]/h2'; - const XPATH_EXPRESSION_ITEM_CONTENT = './/div[@class="ArticleListItem-description"]/div[@class="h6"]'; + const XPATH_EXPRESSION_ITEM_CONTENT = './/div[@class="ArticleListItem-description"]/div[@class="h6"]/text()'; const XPATH_EXPRESSION_ITEM_URI = './/a[@class="ArticleLink ArticleLink"]/@href'; const XPATH_EXPRESSION_ITEM_AUTHOR = ''; const XPATH_EXPRESSION_ITEM_TIMESTAMP = './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp'; diff --git a/lib/XPathAbstract.php b/lib/XPathAbstract.php index 224d8e875a1..6163ca132d6 100644 --- a/lib/XPathAbstract.php +++ b/lib/XPathAbstract.php @@ -76,15 +76,6 @@ abstract class XPathAbstract extends BridgeAbstract */ const XPATH_EXPRESSION_ITEM_CONTENT = ''; - /** - * Use raw item content - * Whether to use the raw item content or to replace certain characters with - * special significance in HTML by HTML entities (using the PHP function htmlspecialchars). - * - * Use {@see XPathAbstract::getSettingUseRawItemContent()} to read this parameter - */ - const SETTING_USE_RAW_ITEM_CONTENT = false; - /** * XPath expression for extracting an item link from the item context * This expression should match a node's attribute containing the article URL @@ -158,6 +149,15 @@ abstract class XPathAbstract extends BridgeAbstract */ const SETTING_FIX_ENCODING = false; + /** + * Use raw item content + * Whether to use the raw item content or to replace certain characters with + * special significance in HTML by HTML entities (using the PHP function htmlspecialchars). + * + * Use {@see XPathAbstract::getSettingUseRawItemContent()} to read this parameter + */ + const SETTING_USE_RAW_ITEM_CONTENT = true; + /** * Internal storage for resulting feed name, automatically detected * @var string @@ -245,15 +245,6 @@ protected function getExpressionItemContent() return static::XPATH_EXPRESSION_ITEM_CONTENT; } - /** - * Use raw item content - * @return bool - */ - protected function getSettingUseRawItemContent(): bool - { - return static::SETTING_USE_RAW_ITEM_CONTENT; - } - /** * XPath expression for extracting an item link from the item context * @return string @@ -309,6 +300,15 @@ protected function getSettingFixEncoding(): bool return static::SETTING_FIX_ENCODING; } + /** + * Use raw item content + * @return bool + */ + protected function getSettingUseRawItemContent(): bool + { + return static::SETTING_USE_RAW_ITEM_CONTENT; + } + /** * Internal helper method for quickly accessing all the user defined constants * in derived classes @@ -331,8 +331,6 @@ private function getParam($name) return $this->getExpressionItemTitle(); case 'content': return $this->getExpressionItemContent(); - case 'raw_content': - return $this->getSettingUseRawItemContent(); case 'uri': return $this->getExpressionItemUri(); case 'author': @@ -345,6 +343,8 @@ private function getParam($name) return $this->getExpressionItemCategories(); case 'fix_encoding': return $this->getSettingFixEncoding(); + case 'raw_content': + return $this->getSettingUseRawItemContent(); } } @@ -438,9 +438,15 @@ public function collectData() continue; } - $isContent = $param === 'content'; - $isCategories = 'categories' === $param; - $value = $this->getItemValueOrNodeValue($typedResult, $isContent, $isContent && !$this->getSettingUseRawItemContent(), $isCategories); + if ('categories' === $param && $typedResult instanceof \DOMNodeList) { + $value = []; + foreach ($typedResult as $domNode) { + $value[] = $this->getItemValueOrNodeValue($domNode, false); + } + } else { + $value = $this->getItemValueOrNodeValue($typedResult, 'content' === $param); + } + $item->__set($param, $this->formatParamValue($param, $value)); } @@ -460,6 +466,7 @@ public function collectData() */ protected function formatParamValue($param, $value) { + $value = is_array($value) ? array_map('trim', $value) : trim($value); $value = is_array($value) ? array_map([$this, 'fixEncoding'], $value) : $this->fixEncoding($value); switch ($param) { case 'title': @@ -503,7 +510,7 @@ protected function formatItemTitle($value) */ protected function formatItemContent($value) { - return $value; + return $this->getParam('raw_content') ? $value : htmlspecialchars($value); } /** @@ -599,68 +606,28 @@ protected function cleanMediaUrl($mediaUrl) * @param $typedResult * @param bool $returnXML * @param bool $escapeHtml - * @param bool $allowMultiple - * @return string|array + * @return string * @throws Exception */ - protected function getItemValueOrNodeValue($typedResult, $returnXML = false, $escapeHtml = false, $allowMultiple = false) + protected function getItemValueOrNodeValue($typedResult, $returnXML = false) { - if ($typedResult instanceof \DOMNodeList && !$allowMultiple) { - $item = $typedResult->item(0); - $text = $this->extractNodeListContent($item, $returnXML); - } elseif ($typedResult instanceof \DOMNodeList && $allowMultiple) { - $text = []; - foreach ($typedResult as $item) { - $text[] = $this->extractNodeListContent($item, $returnXML); - } - } elseif (is_string($typedResult) && strlen($typedResult) > 0) { - $text = $typedResult; - } else { - throw new \Exception('Unknown type of XPath expression result.'); - } - - if (is_array($text)) { - foreach ($text as &$element) { - $element = $this->cleanExtractedText($element, $escapeHtml, $returnXML); - } - } else { - $text = $this->cleanExtractedText($text, $escapeHtml, $returnXML); + if ($typedResult instanceof \DOMNodeList) { + $typedResult = $typedResult->item(0); } - return $text; - } - /** - * @param $item - * @param $returnXML - * @return false|string - * @throws Exception - */ - protected function extractNodeListContent($item, $returnXML) - { - if ($item instanceof \DOMElement) { - return $returnXML ? ($item->ownerDocument ?? $item)->saveXML($item) : $item->nodeValue; - } elseif ($item instanceof \DOMAttr) { - return $item->value; - } elseif ($item instanceof \DOMText) { - return $item->wholeText; + if ($typedResult instanceof \DOMElement) { + return $returnXML ? ($typedResult->ownerDocument ?? $typedResult)->saveXML($typedResult) : $typedResult->nodeValue; + } elseif ($typedResult instanceof \DOMAttr) { + return $typedResult->value; + } elseif ($typedResult instanceof \DOMText) { + return $typedResult->wholeText; + } elseif (is_string($typedResult)) { + return $typedResult; + } elseif (null === $typedResult) { + return ''; } - throw new \Exception('Unknown type of XPath expression result.'); - } - /** - * @param $text - * @param $escapeHtml - * @param $returnXML - * @return string - */ - protected function cleanExtractedText($text, $escapeHtml, $returnXML) - { - $text = trim($text); - - if ($escapeHtml && !$returnXML) { - $text = htmlspecialchars($text); - } - return $text; + throw new \Exception('Unknown type of XPath expression result: ' . gettype($typedResult)); } /**