diff --git a/bridges/BlizzardNewsBridge.php b/bridges/BlizzardNewsBridge.php new file mode 100644 index 00000000..156dc290 --- /dev/null +++ b/bridges/BlizzardNewsBridge.php @@ -0,0 +1,60 @@ + array( + 'locale' => array( + 'name' => 'Language', + 'type' => 'list', + 'values' => array( + 'Deutsch' => 'de-de', + 'English (EU)' => 'en-gb', + 'English (US)' => 'en-us', + 'Español (EU)' => 'es-es', + 'Español (AL)' => 'es-mx', + 'Français' => 'fr-fr', + 'Italiano' => 'it-it', + '日本語' => 'ja-jp', + '한국어' => 'ko-kr', + 'Polski' => 'pl-pl', + 'Português (AL)' => 'pt-br', + 'Русский' => 'ru-ru', + 'ภาษาไทย' => 'th-th', + '简体中文' => 'zh-cn', + '繁體中文' => 'zh-tw' + ), + 'defaultValue' => 'en-us', + 'title' => 'Select your language' + ) + ) + ); + const CACHE_TIMEOUT = 3600; + + const XPATH_EXPRESSION_ITEM = '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article'; + const XPATH_EXPRESSION_ITEM_TITLE = './/div/div[2]/h2'; + const XPATH_EXPRESSION_ITEM_CONTENT = './/div[@class="ArticleListItem-description"]/div[@class="h6"]'; + const XPATH_EXPRESSION_ITEM_URI = './/a[@class="ArticleLink ArticleLink"]/@href'; + const XPATH_EXPRESSION_ITEM_AUTHOR = ''; + const XPATH_EXPRESSION_ITEM_TIMESTAMP = './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp'; + const XPATH_EXPRESSION_ITEM_ENCLOSURES = './/div[@class="ArticleListItem-image"]/@style'; + const XPATH_EXPRESSION_ITEM_CATEGORIES = './/div[@class="ArticleListItem-label"]'; + const SETTING_FIX_ENCODING = true; + + /** + * Source Web page URL (should provide either HTML or XML content) + * @return string + */ + protected function getSourceUrl(){ + + $locale = $this->getInput('locale'); + if('zh-cn' === $locale) { + return 'https://cn.news.blizzard.com'; + } + return 'https://news.blizzard.com/' . 
$locale; + } +} diff --git a/bridges/XPathBridge.php b/bridges/XPathBridge.php new file mode 100644 index 00000000..5aa280e0 --- /dev/null +++ b/bridges/XPathBridge.php @@ -0,0 +1,251 @@ +XPath expressions'; + const MAINTAINER = 'Niehztog'; + const PARAMETERS = array( + '' => array( + + 'url' => array( + 'name' => 'Enter web page URL', + 'title' => <<<"EOL" +You can specify any website URL which serves data suited for display in RSS feeds +(for example a news blog). +EOL + , 'type' => 'text', + 'exampleValue' => 'https://news.blizzard.com/en-en', + 'defaultValue' => 'https://news.blizzard.com/en-en', + 'required' => true + ), + + 'item' => array( + 'name' => 'Item selector', + 'title' => <<<"EOL" +Enter an XPath expression matching a list of dom nodes, each node containing one +feed article item in total (usually a surrounding <div> or <span> tag). This will +be the context nodes for all of the following expressions. This expression usually +starts with a single forward slash. +EOL + , 'type' => 'text', + 'exampleValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article', + 'defaultValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article', + 'required' => true + ), + + 'title' => array( + 'name' => 'Item title selector', + 'title' => <<<"EOL" +This expression should match a node contained within each article item node +containing the article headline. It should start with a dot followed by two +forward slashes, referring to any descendant nodes of the article item node. +EOL + , 'type' => 'text', + 'exampleValue' => './/div/div[2]/h2', + 'defaultValue' => './/div/div[2]/h2', + 'required' => true + ), + + 'content' => array( + 'name' => 'Item description selector', + 'title' => <<<"EOL" +This expression should match a node contained within each article item node +containing the article content or description. 
It should start with a dot +followed by two forward slashes, referring to any descendant nodes of the +article item node. +EOL + , 'type' => 'text', + 'exampleValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]', + 'defaultValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]', + 'required' => false + ), + + 'uri' => array( + 'name' => 'Item URL selector', + 'title' => <<<"EOL" +This expression should match a node's attribute containing the article URL +(usually the href attribute of an <a> tag). It should start with a dot +followed by two forward slashes, referring to any descendant nodes of +the article item node. Attributes can be selected by prepending an @ char +before the attributes name. +EOL + , 'type' => 'text', + 'exampleValue' => './/a[@class="ArticleLink ArticleLink"]/@href', + 'defaultValue' => './/a[@class="ArticleLink ArticleLink"]/@href', + 'required' => false + ), + + 'author' => array( + 'name' => 'Item author selector', + 'title' => <<<"EOL" +This expression should match a node contained within each article item +node containing the article author's name. It should start with a dot +followed by two forward slashes, referring to any descendant nodes of +the article item node. +EOL + , 'type' => 'text', + 'required' => false + ), + + 'timestamp' => array( + 'name' => 'Item date selector', + 'title' => <<<"EOL" +This expression should match a node or node's attribute containing the +article timestamp or date (parsable by PHP's strtotime function). It +should start with a dot followed by two forward slashes, referring to +any descendant nodes of the article item node. Attributes can be +selected by prepending an @ char before the attributes name. 
+EOL + , 'type' => 'text', + 'exampleValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp', + 'defaultValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp', + 'required' => false + ), + + 'enclosures' => array( + 'name' => 'Item image selector', + 'title' => <<<"EOL" +This expression should match a node's attribute containing an article +image URL (usually the src attribute of an <img> tag or a style +attribute). It should start with a dot followed by two forward slashes, +referring to any descendant nodes of the article item node. Attributes +can be selected by prepending an @ char before the attributes name. +EOL + , 'type' => 'text', + 'exampleValue' => './/div[@class="ArticleListItem-image"]/@style', + 'defaultValue' => './/div[@class="ArticleListItem-image"]/@style', + 'required' => false + ), + + 'categories' => array( + 'name' => 'Item category selector', + 'title' => <<<"EOL" +This expression should match a node or node's attribute contained +within each article item node containing the article category. This +could be inside <div> or <span> tags or sometimes be hidden +in a data attribute. It should start with a dot followed by two +forward slashes, referring to any descendant nodes of the article +item node. Attributes can be selected by prepending an @ char +before the attributes name. +EOL + , 'type' => 'text', + 'exampleValue' => './/div[@class="ArticleListItem-label"]', + 'defaultValue' => './/div[@class="ArticleListItem-label"]', + 'required' => false + ), + + 'fix_encoding' => array( + 'name' => 'Fix encoding', + 'title' => <<<"EOL" +Check this to fix feed encoding by invoking PHP's utf8_decode +function on all extracted texts. Try this in case you see "broken" or +"weird" characters in your feed where you'd normally expect umlauts +or any other non-ascii characters. 
+EOL
+			, 'type' => 'checkbox',
+			'required' => false
+		),
+
+		)
+	);
+
+	/**
+	 * Source Web page URL (should provide either HTML or XML content).
+	 * Reads the user supplied 'url' input and normalizes it via encodeUri().
+	 * @return string
+	 */
+	protected function getSourceUrl(){
+		return $this->encodeUri($this->getInput('url'));
+	}
+
+	/**
+	 * XPath expression for extracting the feed items from the source page.
+	 * Reads the 'item' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItem(){
+		return urldecode($this->getInput('item'));
+	}
+
+	/**
+	 * XPath expression for extracting an item title from the item context.
+	 * Reads the 'title' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItemTitle(){
+		return urldecode($this->getInput('title'));
+	}
+
+	/**
+	 * XPath expression for extracting an item's content from the item context.
+	 * Reads the 'content' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItemContent(){
+		return urldecode($this->getInput('content'));
+	}
+
+	/**
+	 * XPath expression for extracting an item link from the item context.
+	 * Reads the 'uri' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItemUri(){
+		return urldecode($this->getInput('uri'));
+	}
+
+	/**
+	 * XPath expression for extracting an item author from the item context.
+	 * Reads the 'author' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItemAuthor(){
+		return urldecode($this->getInput('author'));
+	}
+
+	/**
+	 * XPath expression for extracting an item timestamp from the item context.
+	 * Reads the 'timestamp' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItemTimestamp(){
+		return urldecode($this->getInput('timestamp'));
+	}
+
+	/**
+	 * XPath expression for extracting item enclosures (media content like
+	 * images or movies) from the item context.
+	 * Reads the 'enclosures' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItemEnclosures(){
+		return urldecode($this->getInput('enclosures'));
+	}
+
+	/**
+	 * XPath expression for extracting an item category from the item context.
+	 * Reads the 'categories' input and runs it through urldecode().
+	 * @return string
+	 */
+	protected function getExpressionItemCategories(){
+		return urldecode($this->getInput('categories'));
+	}
+
+	/**
+	 * Fix encoding setting, taken from the 'fix_encoding' checkbox input.
+	 * @return mixed whatever getInput() yields for the checkbox (presumably
+	 * boolean-like - TODO confirm against getInput())
+	 */
+	
protected function getSettingFixEncoding(){
+		return $this->getInput('fix_encoding');
+	}
+
+	/**
+	 * Fixes URL encoding issues in input URLs.
+	 * A URL that still starts with a percent-encoded scheme ('https%3A%2F%2F'
+	 * or 'http%3A%2F%2F') is decoded once with urldecode(); afterwards every
+	 * pipe character is replaced by '%7C'.
+	 * @param string $uri
+	 * @return string
+	 */
+	private function encodeUri($uri)
+	{
+		if (strpos($uri, 'https%3A%2F%2F') === 0
+			|| strpos($uri, 'http%3A%2F%2F') === 0) {
+			$uri = urldecode($uri);
+		}
+
+		$uri = str_replace('|', '%7C', $uri);
+
+		return $uri;
+	}
+}
diff --git a/composer.json b/composer.json
index 3c03eeb0..7a386976 100644
--- a/composer.json
+++ b/composer.json
@@ -34,6 +34,7 @@
 	},
 	"suggest": {
 		"ext-memcached": "Allows to use memcached as cache type",
-		"ext-sqlite3": "Allows to use an SQLite database for caching"
+		"ext-sqlite3": "Allows to use an SQLite database for caching",
+		"ext-dom": "Allows to use some bridges based on XPath expressions"
 	}
 }
diff --git a/lib/XPathAbstract.php b/lib/XPathAbstract.php
new file mode 100644
index 00000000..e08f48d1
--- /dev/null
+++ b/lib/XPathAbstract.php
@@ -0,0 +1,583 @@
+ or tag). This will
+	 * be the context nodes for all of the following expressions. This expression usually
+	 * starts with a single forward slash.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItem()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM = '';
+
+	/**
+	 * XPath expression for extracting an item title from the item context
+	 * This expression should match a node contained within each article item node
+	 * containing the article headline. It should start with a dot followed by two
+	 * forward slashes, referring to any descendant nodes of the article item node.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_TITLE = '';
+
+	/**
+	 * XPath expression for extracting an item's content from the item context
+	 * This expression should match a node contained within each article item node
+	 * containing the article content or description. 
It should start with a dot
+	 * followed by two forward slashes, referring to any descendant nodes of the
+	 * article item node.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_CONTENT = '';
+
+	/**
+	 * XPath expression for extracting an item link from the item context
+	 * This expression should match a node's attribute containing the article URL
+	 * (usually the href attribute of an <a> tag). It should start with a dot
+	 * followed by two forward slashes, referring to any descendant nodes of
+	 * the article item node. Attributes can be selected by prepending an @ char
+	 * before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_URI = '';
+
+	/**
+	 * XPath expression for extracting an item author from the item context
+	 * This expression should match a node contained within each article item
+	 * node containing the article author's name. It should start with a dot
+	 * followed by two forward slashes, referring to any descendant nodes of
+	 * the article item node.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_AUTHOR = '';
+
+	/**
+	 * XPath expression for extracting an item timestamp from the item context
+	 * This expression should match a node or node's attribute containing the
+	 * article timestamp or date (parsable by PHP's strtotime function). It
+	 * should start with a dot followed by two forward slashes, referring to
+	 * any descendant nodes of the article item node. Attributes can be
+	 * selected by prepending an @ char before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';
+
+	/**
+	 * XPath expression for extracting item enclosures (media content like
+	 * images or movies) from the item context
+	 * This expression should match a node's attribute containing an article
+	 * image URL (usually the src attribute of an <img> tag or a style
+	 * attribute). It should start with a dot followed by two forward slashes,
+	 * referring to any descendant nodes of the article item node. Attributes
+	 * can be selected by prepending an @ char before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';
+
+	/**
+	 * XPath expression for extracting an item category from the item context
+	 * This expression should match a node or node's attribute contained
+	 * within each article item node containing the article category. This
+	 * could be inside <div> or <span> tags or sometimes be hidden
+	 * in a data attribute. It should start with a dot followed by two
+	 * forward slashes, referring to any descendant nodes of the article
+	 * item node. Attributes can be selected by prepending an @ char
+	 * before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_CATEGORIES = '';
+
+	/**
+	 * Fix encoding
+	 * Set this to true for fixing feed encoding by invoking PHP's utf8_decode
+	 * function on all extracted texts. Try this in case you see "broken" or
+	 * "weird" characters in your feed where you'd normally expect umlauts
+	 * or any other non-ascii characters.
+	 *
+	 * Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
+	 */
+	const SETTING_FIX_ENCODING = false;
+
+	/**
+	 * Internal storage for resulting feed name, automatically detected
+	 * @var string
+	 */
+	private $feedName;
+
+	/**
+	 * Internal storage for the feed URI (the source page URL), assigned
+	 * at the start of collectData()
+	 * @var string
+	 */
+	private $feedUri;
+
+	/**
+	 * Internal storage for resulting feed favicon, automatically detected
+	 * @var string
+	 */
+	private $feedIcon;
+
+	/**
+	 * Feed name: the stored feed name when it is truthy, otherwise the
+	 * parent's getName() result.
+	 */
+	public function getName(){
+		return $this->feedName ?: parent::getName();
+	}
+
+	/**
+	 * Feed URI: the stored feed URI when it is truthy, otherwise the
+	 * parent's getURI() result.
+	 */
+	public function getURI() {
+		return $this->feedUri ?: parent::getURI();
+	}
+
+	/**
+	 * Feed icon: the stored favicon URL when it is truthy, otherwise the
+	 * parent's getIcon() result.
+	 */
+	public function getIcon() {
+		return $this->feedIcon ?: parent::getIcon();
+	}
+
+	/**
+	 * Source Web page URL (should provide either HTML or XML content)
+	 * Resolved with late static binding, so derived classes can simply
+	 * redefine the FEED_SOURCE_URL constant.
+	 * @return string
+	 */
+	protected function getSourceUrl(){
+		return static::FEED_SOURCE_URL;
+	}
+
+	/**
+	 * XPath expression for extracting the feed title from the source page
+	 * @return string
+	 */
+	protected function getExpressionTitle(){
+		return static::XPATH_EXPRESSION_FEED_TITLE;
+	}
+
+	/**
+	 * XPath expression for extracting the feed favicon from the source page
+	 * @return string
+	 */
+	protected function getExpressionIcon(){
+		return static::XPATH_EXPRESSION_FEED_ICON;
+	}
+
+	/**
+	 
* XPath expression for extracting the feed items from the source page
+	 * @return string
+	 */
+	protected function getExpressionItem(){
+		return static::XPATH_EXPRESSION_ITEM;
+	}
+
+	/**
+	 * XPath expression for extracting an item title from the item context
+	 * @return string
+	 */
+	protected function getExpressionItemTitle(){
+		return static::XPATH_EXPRESSION_ITEM_TITLE;
+	}
+
+	/**
+	 * XPath expression for extracting an item's content from the item context
+	 * @return string
+	 */
+	protected function getExpressionItemContent(){
+		return static::XPATH_EXPRESSION_ITEM_CONTENT;
+	}
+
+	/**
+	 * XPath expression for extracting an item link from the item context
+	 * @return string
+	 */
+	protected function getExpressionItemUri(){
+		return static::XPATH_EXPRESSION_ITEM_URI;
+	}
+
+	/**
+	 * XPath expression for extracting an item author from the item context
+	 * @return string
+	 */
+	protected function getExpressionItemAuthor(){
+		return static::XPATH_EXPRESSION_ITEM_AUTHOR;
+	}
+
+	/**
+	 * XPath expression for extracting an item timestamp from the item context
+	 * @return string
+	 */
+	protected function getExpressionItemTimestamp(){
+		return static::XPATH_EXPRESSION_ITEM_TIMESTAMP;
+	}
+
+	/**
+	 * XPath expression for extracting item enclosures (media content like
+	 * images or movies) from the item context
+	 * @return string
+	 */
+	protected function getExpressionItemEnclosures(){
+		return static::XPATH_EXPRESSION_ITEM_ENCLOSURES;
+	}
+
+	/**
+	 * XPath expression for extracting an item category from the item context
+	 * @return string
+	 */
+	protected function getExpressionItemCategories(){
+		return static::XPATH_EXPRESSION_ITEM_CATEGORIES;
+	}
+
+	/**
+	 * Fix encoding
+	 * @return bool the SETTING_FIX_ENCODING constant of the concrete class
+	 */
+	protected function getSettingFixEncoding(){
+		return static::SETTING_FIX_ENCODING;
+	}
+
+	/**
+	 * Internal helper method for quickly accessing all the user defined constants
+	 * in derived classes. Every lookup goes through the matching protected
+	 * getter, so overriding a getter in a derived class is honoured here.
+	 *
+	 * @param string $name one of: url, feed_title, feed_icon, item, title, content,
+	 * uri, author, timestamp, enclosures, categories, fix_encoding
+	 * @return bool|string|null null for an unknown parameter name
+	 */
+	private function getParam($name){
+		switch($name) {
+
+			case 'url':
+				return $this->getSourceUrl();
+			case 'feed_title':
+				return $this->getExpressionTitle();
+			case 'feed_icon':
+				return $this->getExpressionIcon();
+			case 'item':
+				return $this->getExpressionItem();
+			case 'title':
+				return $this->getExpressionItemTitle();
+			case 'content':
+				return $this->getExpressionItemContent();
+			case 'uri':
+				return $this->getExpressionItemUri();
+			case 'author':
+				return $this->getExpressionItemAuthor();
+			case 'timestamp':
+				return $this->getExpressionItemTimestamp();
+			case 'enclosures':
+				return $this->getExpressionItemEnclosures();
+			case 'categories':
+				return $this->getExpressionItemCategories();
+			case 'fix_encoding':
+				return $this->getSettingFixEncoding();
+		}
+
+		// Unknown names previously fell off the switch; make the null result explicit
+		return null;
+	}
+
+	/**
+	 * Should provide the source website HTML content
+	 * can be easily overwritten for example if special headers or auth infos are required
+	 * @return string
+	 */
+	protected function provideWebsiteContent() {
+		return getContents($this->feedUri);
+	}
+
+	/**
+	 * Should provide the feeds title
+	 *
+	 * @param DOMXPath $xpath
+	 * @return string|null null when no expression is configured, the expression
+	 * is invalid or it does not match exactly one node
+	 */
+	protected function provideFeedTitle(DOMXPath $xpath) {
+		$expression = $this->getParam('feed_title');
+		// DOMXPath::query() answers an empty expression with a warning and false
+		if(!is_string($expression) || '' === $expression) {
+			return null;
+		}
+
+		// query() returns false for a malformed expression; count(false) is a
+		// TypeError on PHP 8, so check the node list type and its length instead
+		$title = $xpath->query($expression);
+		if($title instanceof DOMNodeList && $title->length === 1) {
+			return $this->getItemValueOrNodeValue($title);
+		}
+
+		return null;
+	}
+
+	/**
+	 * Should provide the URL of the feed's favicon
+	 *
+	 * @param DOMXPath $xpath
+	 * @return string|null null when no expression is configured, the expression
+	 * is invalid, it does not match exactly one node or no image URL is found
+	 */
+	protected function provideFeedIcon(DOMXPath $xpath) {
+		$expression = $this->getParam('feed_icon');
+		if(!is_string($expression) || '' === $expression) {
+			return null;
+		}
+
+		// Same guard as for the title: never count() a possible false result
+		$icon = $xpath->query($expression);
+		if($icon instanceof DOMNodeList && $icon->length === 1) {
+			return $this->cleanImageUrl($this->getItemValueOrNodeValue($icon));
+		}
+
+		return null;
+	}
+
+	/**
+	 * Should provide the feed's items. 
+	 *
+	 * @param DOMXPath $xpath
+	 * @return DOMNodeList|false false when the item expression cannot be evaluated
+	 */
+	protected function provideFeedItems(DOMXPath $xpath) {
+		return @$xpath->query($this->getParam('item'));
+	}
+
+	/**
+	 * Loads the source page, detects feed name and icon and turns every node
+	 * matched by the item expression into a feed item. Item parameters whose
+	 * expression is empty or yields no result are left unset.
+	 */
+	public function collectData() {
+
+		$this->feedUri = $this->getParam('url');
+
+		$webPageHtml = new DOMDocument();
+		// Silence libxml warnings about sloppy real-world HTML, but remember
+		// the previous error mode so it is restored instead of forced to false
+		$previousErrorMode = libxml_use_internal_errors(true);
+		$webPageHtml->loadHTML($this->provideWebsiteContent());
+		libxml_clear_errors();
+		libxml_use_internal_errors($previousErrorMode);
+
+		$xpath = new DOMXPath($webPageHtml);
+
+		$this->feedName = $this->provideFeedTitle($xpath);
+		$this->feedIcon = $this->provideFeedIcon($xpath);
+
+		$entries = $this->provideFeedItems($xpath);
+		if($entries === false) {
+			return;
+		}
+
+		foreach ($entries as $entry) {
+			$item = new \FeedItem();
+			foreach(array('title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories') as $param) {
+
+				$expression = $this->getParam($param);
+				if('' === $expression) {
+					continue;
+				}
+
+				//can be a string or DOMNodeList, depending on the expression result
+				$typedResult = @$xpath->evaluate($expression, $entry);
+				// DOMNodeList::length works on every PHP version, whereas
+				// count() on a node list needs PHP 7.2+
+				if ($typedResult === false
+					|| ($typedResult instanceof DOMNodeList && $typedResult->length === 0)
+					|| (is_string($typedResult) && strlen(trim($typedResult)) === 0)) {
+					continue;
+				}
+
+				$item->__set($param, $this->formatParamValue($param, $this->getItemValueOrNodeValue($typedResult)));
+
+			}
+
+			$itemId = $this->generateItemId($item);
+			if(null !== $itemId) {
+				$item->setUid($itemId);
+			}
+
+			$this->items[] = $item;
+		}
+
+	}
+
+	/**
+	 * Applies the optional encoding fix exactly once and hands the value to the
+	 * formatter hook that belongs to the given item parameter.
+	 *
+	 * @param string $param item parameter name (title, content, uri, author,
+	 * timestamp, enclosures or categories)
+	 * @param string $value raw value extracted from the page
+	 * @return string|array|int|false arrays for enclosures and categories, the
+	 * strtotime() result for timestamp, strings otherwise
+	 */
+	protected function formatParamValue($param, $value)
+	{
+		$value = $this->fixEncoding($value);
+		switch ($param) {
+			case 'title':
+				return $this->formatItemTitle($value);
+			case 'content':
+				return $this->formatItemContent($value);
+			case 'uri':
+				return $this->formatItemUri($value);
+			case 'author':
+				return $this->formatItemAuthor($value);
+			case 'timestamp':
+				return $this->formatItemTimestamp($value);
+			case 'enclosures':
+				// go through the hook so that overrides in derived classes take effect
+				return $this->formatItemEnclosures($value);
+			case 'categories':
+				// $value is already decoded above; decoding it a second time
+				// would corrupt every non-ASCII character
+				return $this->formatItemCategories($value);
+		}
+		return $value;
+	}
+
+	/**
+	 * Formats the title of a feed item. Takes extracted raw title and returns it formatted
+	 * as string.
+	 * Can be easily overridden in case the value needs to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemTitle($value) {
+		return $value;
+	}
+
+	/**
+	 * Formats the content of a feed item. Takes extracted raw content and returns it
+	 * formatted as string.
+	 * Can be easily overridden in case the value needs to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemContent($value) {
+		return $value;
+	}
+
+	/**
+	 * Formats the URI of a feed item. Takes extracted raw URI and returns it formatted
+	 * as string. Absolute http(s) URIs are returned unchanged, anything else is
+	 * resolved against the feed URI.
+	 * Can be easily overridden in case the value needs to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemUri($value) {
+		if(strlen($value) === 0) {
+			return '';
+		}
+		if(strpos($value, 'http://') === 0 || strpos($value, 'https://') === 0) {
+			return $value;
+		}
+
+		return urljoin($this->feedUri, $value);
+	}
+
+	/**
+	 * Formats the author of a feed item. Takes extracted raw author and returns it formatted
+	 * as string.
+	 * Can be easily overridden in case the value needs to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemAuthor($value) {
+		return $value;
+	}
+
+	/**
+	 * Formats the timestamp of a feed item. Takes extracted raw timestamp and returns unix
+	 * timestamp as integer.
+	 * Can be easily overwritten for example if a special format has to be expected on the
+	 * source website. 
+	 * @param string $value
+	 * @return false|int
+	 */
+	protected function formatItemTimestamp($value) {
+		return strtotime($value);
+	}
+
+	/**
+	 * Formats the enclosures of a feed item. Takes extracted raw enclosures and returns them
+	 * formatted as array.
+	 * Can be easily overridden in case the values need to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return array single element array holding the cleaned image URL
+	 */
+	protected function formatItemEnclosures($value) {
+		return array($this->cleanImageUrl($value));
+	}
+
+	/**
+	 * Formats the categories of a feed item. Takes extracted raw categories and returns them
+	 * formatted as array.
+	 * Can be easily overridden in case the values need to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return array
+	 */
+	protected function formatItemCategories($value) {
+		return array($value);
+	}
+
+	/**
+	 * Extracts the first image URL (jpg, gif, png, jpeg or ico) found inside the
+	 * given text, for example inside a style attribute, and resolves it against
+	 * the feed URI.
+	 *
+	 * @param string $imageUrl text containing an image URL
+	 * @return string|void nothing (null) when the text contains no image URL
+	 */
+	protected function cleanImageUrl($imageUrl)
+	{
+		$result = preg_match('~(?:http(?:s)?:)?[\/a-zA-Z0-9\-_\.]+\.(?:jpg|gif|png|jpeg|ico){1}~', $imageUrl, $matches);
+		if(1 !== $result) {
+			return;
+		}
+		return urljoin($this->feedUri, $matches[0]);
+	}
+
+	/**
+	 * Reduces the result of an XPath query or evaluation to a trimmed string.
+	 * For node lists only the first node is used.
+	 *
+	 * @param DOMNodeList|string|int|float $typedResult
+	 * @return string
+	 */
+	protected function getItemValueOrNodeValue($typedResult)
+	{
+		if($typedResult instanceof DOMNodeList) {
+			$item = $typedResult->item(0);
+			// Every node type (element, attribute, text(), CDATA, comment)
+			// exposes its text through nodeValue; for attributes it equals
+			// the attribute value
+			if ($item instanceof DOMNode) {
+				return trim((string)$item->nodeValue);
+			}
+		} elseif(is_string($typedResult) && strlen($typedResult) > 0) {
+			return trim($typedResult);
+		} elseif(is_int($typedResult) || is_float($typedResult)) {
+			// numeric results of XPath functions such as count() or number()
+			return (string)$typedResult;
+		}
+		returnServerError('Unknown type of XPath expression result.');
+	}
+
+	/**
+	 * Fixes feed encoding by invoking PHP's utf8_decode function on extracted texts.
+	 * Useful in case of "broken" or "weird" characters in the feed where you'd normally
+	 * expect umlauts. 
+	 *
+	 * @param $input
+	 * @return string the decoded text when the fix_encoding setting is truthy,
+	 * the untouched input otherwise
+	 */
+	protected function fixEncoding($input)
+	{
+		if($this->getParam('fix_encoding')) {
+			return utf8_decode($input);
+		}
+
+		return $input;
+	}
+
+	/**
+	 * Hook for derived classes that want to determine item Uid's themselves.
+	 * The default returns null, which leaves the Uid to auto generation.
+	 *
+	 * @param FeedItem $item
+	 * @return string|null
+	 */
+	protected function generateItemId(\FeedItem $item) {
+		return null; //auto generation
+	}
+}
diff --git a/lib/rssbridge.php b/lib/rssbridge.php
index 25232986..2e7fbf2a 100644
--- a/lib/rssbridge.php
+++ b/lib/rssbridge.php
@@ -74,6 +74,7 @@ require_once PATH_LIB . 'BridgeList.php';
 require_once PATH_LIB . 'ParameterValidator.php';
 require_once PATH_LIB . 'ActionFactory.php';
 require_once PATH_LIB . 'ActionAbstract.php';
+require_once PATH_LIB . 'XPathAbstract.php';
 
 // Functions
 require_once PATH_LIB . 'html.php';