[BridgeXPathAbstract + BlizzardNewsBridge + XPathBridge] Add new abstract class + two example implementations (#1671)

2020-11-08 08:22:41 +01:00 · 2020-11-08 08:22:41 +01:00 · 3ad138026d
commit 3ad138026d
parent d05a8b79fe
5 changed files with 897 additions and 1 deletions
--- a/bridges/BlizzardNewsBridge.php
+++ b/bridges/BlizzardNewsBridge.php
@ -0,0 +1,60 @@
+<?php
+
+class BlizzardNewsBridge extends XPathAbstract {
+
+	const NAME = 'Blizzard News';
+	const URI = 'https://news.blizzard.com';
+	const DESCRIPTION = 'Blizzard (game company) newsfeed';
+	const MAINTAINER = 'Niehztog';
+	const PARAMETERS = array(
+		'' => array(
+			'locale' => array(
+				'name' => 'Language',
+				'type' => 'list',
+				'values' => array(
+					'Deutsch' => 'de-de',
+					'English (EU)' => 'en-gb',
+					'English (US)' => 'en-us',
+					'Español (EU)' => 'es-es',
+					'Español (AL)' => 'es-mx',
+					'Français' => 'fr-fr',
+					'Italiano' => 'it-it',
+					'日本語' => 'ja-jp',
+					'한국어' => 'ko-kr',
+					'Polski' => 'pl-pl',
+					'Português (AL)' => 'pt-br',
+					'Русский' => 'ru-ru',
+					'ภาษาไทย' => 'th-th',
+					'简体中文' => 'zh-cn',
+					'繁體中文' => 'zh-tw'
+				),
+				'defaultValue' => 'en-us',
+				'title' => 'Select your language'
+			)
+		)
+	);
+	const CACHE_TIMEOUT = 3600;
+
+	const XPATH_EXPRESSION_ITEM = '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article';
+	const XPATH_EXPRESSION_ITEM_TITLE = './/div/div[2]/h2';
+	const XPATH_EXPRESSION_ITEM_CONTENT = './/div[@class="ArticleListItem-description"]/div[@class="h6"]';
+	const XPATH_EXPRESSION_ITEM_URI = './/a[@class="ArticleLink ArticleLink"]/@href';
+	const XPATH_EXPRESSION_ITEM_AUTHOR = '';
+	const XPATH_EXPRESSION_ITEM_TIMESTAMP = './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp';
+	const XPATH_EXPRESSION_ITEM_ENCLOSURES = './/div[@class="ArticleListItem-image"]/@style';
+	const XPATH_EXPRESSION_ITEM_CATEGORIES = './/div[@class="ArticleListItem-label"]';
+	const SETTING_FIX_ENCODING = true;
+
+	/**
+	 * Builds the news page URL for the locale selected in the 'locale' input.
+	 *
+	 * The 'zh-cn' locale is served from a dedicated host; every other locale is
+	 * appended as a path segment to the main news host.
+	 * @return string
+	 */
+	protected function getSourceUrl(){
+		$selectedLocale = $this->getInput('locale');
+
+		return 'zh-cn' === $selectedLocale
+			? 'https://cn.news.blizzard.com'
+			: 'https://news.blizzard.com/' . $selectedLocale;
+	}
+}
--- a/bridges/XPathBridge.php
+++ b/bridges/XPathBridge.php
@ -0,0 +1,251 @@
+<?php
+
+class XPathBridge extends XPathAbstract {
+	const NAME = 'XPathBridge';
+	const URI = 'https://github.com/rss-bridge/rss-bridge';
+	const DESCRIPTION
+		= 'Parse any webpage using <a href="https://devhints.io/xpath" target="_blank">XPath expressions</a>';
+	const MAINTAINER = 'Niehztog';
+	const PARAMETERS = array(
+		'' => array(
+
+			'url' => array(
+				'name' => 'Enter web page URL',
+				'title' => <<<"EOL"
+You can specify any website URL which serves data suited for display in RSS feeds
+(for example a news blog).
+EOL
+				, 'type' => 'text',
+				'exampleValue' => 'https://news.blizzard.com/en-en',
+				'defaultValue' => 'https://news.blizzard.com/en-en',
+				'required' => true
+			),
+
+			'item' => array(
+				'name' => 'Item selector',
+				'title' => <<<"EOL"
+Enter an XPath expression matching a list of dom nodes, each node containing one
+feed article item in total (usually a surrounding &lt;div&gt; or &lt;span&gt; tag). This will
+be the context nodes for all of the following expressions. This expression usually
+starts with a single forward slash.
+EOL
+				, 'type' => 'text',
+				'exampleValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
+				'defaultValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
+				'required' => true
+			),
+
+			'title' => array(
+				'name' => 'Item title selector',
+				'title' => <<<"EOL"
+This expression should match a node contained within each article item node
+containing the article headline. It should start with a dot followed by two
+forward slashes, referring to any descendant nodes of the article item node.
+EOL
+				, 'type' => 'text',
+				'exampleValue' => './/div/div[2]/h2',
+				'defaultValue' => './/div/div[2]/h2',
+				'required' => true
+			),
+
+			'content' => array(
+				'name' => 'Item description selector',
+				'title' => <<<"EOL"
+This expression should match a node contained within each article item node
+containing the article content or description. It should start with a dot
+followed by two forward slashes, referring to any descendant nodes of the
+article item node.
+EOL
+				, 'type' => 'text',
+				'exampleValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
+				'defaultValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
+				'required' => false
+			),
+
+			'uri' => array(
+				'name' => 'Item URL selector',
+				'title' => <<<"EOL"
+This expression should match a node's attribute containing the article URL
+(usually the href attribute of an &lt;a&gt; tag). It should start with a dot
+followed by two forward slashes, referring to any descendant nodes of
+the article item node. Attributes can be selected by prepending an @ char
+before the attributes name.
+EOL
+				, 'type' => 'text',
+				'exampleValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
+				'defaultValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
+				'required' => false
+			),
+
+			'author' => array(
+				'name' => 'Item author selector',
+				'title' => <<<"EOL"
+This expression should match a node contained within each article item
+node containing the article author's name. It should start with a dot
+followed by two forward slashes, referring to any descendant nodes of
+the article item node.
+EOL
+				, 'type' => 'text',
+				'required' => false
+			),
+
+			'timestamp' => array(
+				'name' => 'Item date selector',
+				'title' => <<<"EOL"
+This expression should match a node or node's attribute containing the
+article timestamp or date (parsable by PHP's strtotime function). It
+should start with a dot followed by two forward slashes, referring to
+any descendant nodes of the article item node. Attributes can be
+selected by prepending an @ char before the attributes name.
+EOL
+				, 'type' => 'text',
+				'exampleValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
+				'defaultValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
+				'required' => false
+			),
+
+			'enclosures' => array(
+				'name' => 'Item image selector',
+				'title' => <<<"EOL"
+This expression should match a node's attribute containing an article
+image URL (usually the src attribute of an &lt;img&gt; tag or a style
+attribute). It should start with a dot followed by two forward slashes,
+referring to any descendant nodes of the article item node. Attributes
+can be selected by prepending an @ char before the attributes name.
+EOL
+				, 'type' => 'text',
+				'exampleValue' => './/div[@class="ArticleListItem-image"]/@style',
+				'defaultValue' => './/div[@class="ArticleListItem-image"]/@style',
+				'required' => false
+			),
+
+			'categories' => array(
+				'name' => 'Item category selector',
+				'title' => <<<"EOL"
+This expression should match a node or node's attribute contained
+within each article item node containing the article category. This
+could be inside &lt;div&gt; or &lt;span&gt; tags or sometimes be hidden
+in a data attribute. It should start with a dot followed by two
+forward slashes, referring to any descendant nodes of the article
+item node. Attributes can be selected by prepending an @ char
+before the attributes name.
+EOL
+				, 'type' => 'text',
+				'exampleValue' => './/div[@class="ArticleListItem-label"]',
+				'defaultValue' => './/div[@class="ArticleListItem-label"]',
+				'required' => false
+			),
+
+			'fix_encoding' => array(
+				'name' => 'Fix encoding',
+				'title' => <<<"EOL"
+Check this to fix feed encoding by invoking PHP's utf8_decode
+function on all extracted texts. Try this in case you see "broken" or
+"weird" characters in your feed where you'd normally expect umlauts
+or any other non-ascii characters.
+EOL
+				, 'type' => 'checkbox',
+				'required' => false
+			),
+
+		)
+	);
+
+	/**
+	 * Returns the web page URL entered in the 'url' input, passed through
+	 * {@see XPathBridge::encodeUri()} first.
+	 * @return string
+	 */
+	protected function getSourceUrl(){
+		$enteredUrl = $this->getInput('url');
+
+		return $this->encodeUri($enteredUrl);
+	}
+
+	/**
+	 * XPath expression selecting the feed item nodes, taken from the 'item'
+	 * input and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItem(){
+		$rawExpression = $this->getInput('item');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * XPath expression selecting an item's title, taken from the 'title'
+	 * input and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItemTitle(){
+		$rawExpression = $this->getInput('title');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * XPath expression selecting an item's content, taken from the 'content'
+	 * input and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItemContent(){
+		$rawExpression = $this->getInput('content');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * XPath expression selecting an item's link, taken from the 'uri' input
+	 * and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItemUri(){
+		$rawExpression = $this->getInput('uri');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * XPath expression selecting an item's author, taken from the 'author'
+	 * input and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItemAuthor(){
+		$rawExpression = $this->getInput('author');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * XPath expression selecting an item's timestamp, taken from the
+	 * 'timestamp' input and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItemTimestamp(){
+		$rawExpression = $this->getInput('timestamp');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * XPath expression selecting an item's enclosures (media content like
+	 * images or movies), taken from the 'enclosures' input and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItemEnclosures(){
+		$rawExpression = $this->getInput('enclosures');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * XPath expression selecting an item's category, taken from the
+	 * 'categories' input and URL-decoded.
+	 * @return string
+	 */
+	protected function getExpressionItemCategories(){
+		$rawExpression = $this->getInput('categories');
+
+		return urldecode($rawExpression);
+	}
+
+	/**
+	 * Fix encoding setting, read from the 'fix_encoding' checkbox input.
+	 * @return mixed the input value as delivered by getInput(); presumably a
+	 * boolean for checkbox inputs - TODO confirm against BridgeAbstract
+	 */
+	protected function getSettingFixEncoding(){
+		$fixEncodingInput = $this->getInput('fix_encoding');
+
+		return $fixEncodingInput;
+	}
+
+	/**
+	 * Fixes URL encoding issues in input URLs.
+	 *
+	 * A URL whose scheme separator arrives percent-encoded (it starts with
+	 * "https%3A%2F%2F" or "http%3A%2F%2F") is URL-decoded once. Afterwards
+	 * every pipe character is replaced by its percent-encoded form.
+	 * @param string $uri
+	 * @return string
+	 */
+	private function encodeUri($uri)
+	{
+		$hasEncodedScheme = false;
+		foreach (array('https%3A%2F%2F', 'http%3A%2F%2F') as $encodedScheme) {
+			if (strpos($uri, $encodedScheme) === 0) {
+				$hasEncodedScheme = true;
+				break;
+			}
+		}
+
+		$normalized = $hasEncodedScheme ? urldecode($uri) : $uri;
+
+		return str_replace('|', '%7C', $normalized);
+	}
+}
--- a/composer.json
+++ b/composer.json
@ -34,6 +34,7 @@
    },
    "suggest": {
        "ext-memcached": "Allows to use memcached as cache type",
-        "ext-sqlite3": "Allows to use an SQLite database for caching"
+        "ext-sqlite3": "Allows to use an SQLite database for caching",
+        "ext-dom": "Allows to use some bridges based on XPath expressions"
    }
 }
--- a/lib/XPathAbstract.php
+++ b/lib/XPathAbstract.php
@ -0,0 +1,583 @@
+<?php
+
+/**
+ * An alternative abstract class for bridges utilizing XPath expressions
+ *
+ * This class is meant as an alternative base class for bridge implementations.
+ * It offers preliminary functionality for generating feeds based on XPath
+ * expressions.
+ * As a minimum, extending classes should define XPath expressions pointing
+ * to the feed items contents in the class constants below. In case there is
+ * more manual fine tuning required, it offers a bunch of methods which can
+ * be overridden, for example in order to specify formatting of field values
+ * or more flexible definition of dynamic XPath expressions.
+ *
+ * This class extends {@see BridgeAbstract}, which means it incorporates and
+ * extends all of its functionality.
+ **/
+abstract class XPathAbstract extends BridgeAbstract {
+
+	/**
+	 * Source Web page URL (should provide either HTML or XML content)
+	 * You can specify any website URL which serves data suited for display in RSS feeds
+	 * (for example a news blog).
+	 *
+	 * Use {@see XPathAbstract::getSourceUrl()} to read this parameter
+	 */
+	const FEED_SOURCE_URL = '';
+
+	/**
+	 * XPath expression for extracting the feed title from the source page.
+	 * If this is left blank or does not provide any data {@see BridgeAbstract::getName()}
+	 * is used instead as the feed's title.
+	 *
+	 * Use {@see XPathAbstract::getExpressionTitle()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_FEED_TITLE = './/title';
+
+	/**
+	 * XPath expression for extracting the feed favicon URL from the source page.
+	 * If this is left blank or does not provide any data {@see BridgeAbstract::getIcon()}
+	 * is used instead as the feed's favicon URL.
+	 *
+	 * Use {@see XPathAbstract::getExpressionIcon()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_FEED_ICON = './/link[@rel="icon"]/@href';
+
+	/**
+	 * XPath expression for extracting the feed items from the source page
+	 * Enter an XPath expression matching a list of dom nodes, each node containing one
+	 * feed article item in total (usually a surrounding <div> or <span> tag). This will
+	 * be the context nodes for all of the following expressions. This expression usually
+	 * starts with a single forward slash.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItem()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM = '';
+
+	/**
+	 * XPath expression for extracting an item title from the item context
+	 * This expression should match a node contained within each article item node
+	 * containing the article headline. It should start with a dot followed by two
+	 * forward slashes, referring to any descendant nodes of the article item node.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_TITLE = '';
+
+	/**
+	 * XPath expression for extracting an item's content from the item context
+	 * This expression should match a node contained within each article item node
+	 * containing the article content or description. It should start with a dot
+	 * followed by two forward slashes, referring to any descendant nodes of the
+	 * article item node.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_CONTENT = '';
+
+	/**
+	 * XPath expression for extracting an item link from the item context
+	 * This expression should match a node's attribute containing the article URL
+	 * (usually the href attribute of an <a> tag). It should start with a dot
+	 * followed by two forward slashes, referring to any descendant nodes of
+	 * the article item node. Attributes can be selected by prepending an @ char
+	 * before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_URI = '';
+
+	/**
+	 * XPath expression for extracting an item author from the item context
+	 * This expression should match a node contained within each article item
+	 * node containing the article author's name. It should start with a dot
+	 * followed by two forward slashes, referring to any descendant nodes of
+	 * the article item node.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_AUTHOR = '';
+
+	/**
+	 * XPath expression for extracting an item timestamp from the item context
+	 * This expression should match a node or node's attribute containing the
+	 * article timestamp or date (parsable by PHP's strtotime function). It
+	 * should start with a dot followed by two forward slashes, referring to
+	 * any descendant nodes of the article item node. Attributes can be
+	 * selected by prepending an @ char before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';
+
+	/**
+	 * XPath expression for extracting item enclosures (media content like
+	 * images or movies) from the item context
+	 * This expression should match a node's attribute containing an article
+	 * image URL (usually the src attribute of an <img> tag or a style
+	 * attribute). It should start with a dot followed by two forward slashes,
+	 * referring to any descendant nodes of the article item node. Attributes
+	 * can be selected by prepending an @ char before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';
+
+	/**
+	 * XPath expression for extracting an item category from the item context
+	 * This expression should match a node or node's attribute contained
+	 * within each article item node containing the article category. This
+	 * could be inside <div> or <span> tags or sometimes be hidden
+	 * in a data attribute. It should start with a dot followed by two
+	 * forward slashes, referring to any descendant nodes of the article
+	 * item node. Attributes can be selected by prepending an @ char
+	 * before the attributes name.
+	 *
+	 * Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
+	 */
+	const XPATH_EXPRESSION_ITEM_CATEGORIES = '';
+
+	/**
+	 * Fix encoding
+	 * Set this to true for fixing feed encoding by invoking PHP's utf8_decode
+	 * function on all extracted texts. Try this in case you see "broken" or
+	 * "weird" characters in your feed where you'd normally expect umlauts
+	 * or any other non-ascii characters.
+	 *
+	 * Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
+	 */
+	const SETTING_FIX_ENCODING = false;
+
+	/**
+	 * Internal storage for resulting feed name, automatically detected
+	 * @var string
+	 */
+	private $feedName;
+
+	/**
+	 * Internal storage for the feed's source URL, set from getSourceUrl() when data is collected
+	 * @var string
+	 */
+	private $feedUri;
+
+	/**
+	 * Internal storage for resulting feed favicon, automatically detected
+	 * @var string
+	 */
+	private $feedIcon;
+
+	/**
+	 * Feed name: the title detected on the source page, or the parent's
+	 * name when no (non-empty) title has been detected.
+	 * @return string
+	 */
+	public function getName(){
+		if($this->feedName) {
+			return $this->feedName;
+		}
+
+		return parent::getName();
+	}
+
+	/**
+	 * Feed URI: the source URL stored while collecting data, or the parent's
+	 * URI when none has been stored yet.
+	 * @return string
+	 */
+	public function getURI() {
+		if($this->feedUri) {
+			return $this->feedUri;
+		}
+
+		return parent::getURI();
+	}
+
+	/**
+	 * Feed icon: the favicon URL detected on the source page, or the parent's
+	 * icon when none has been detected.
+	 * @return string
+	 */
+	public function getIcon() {
+		if($this->feedIcon) {
+			return $this->feedIcon;
+		}
+
+		return parent::getIcon();
+	}
+
+	/**
+	 * Source Web page URL (should provide either HTML or XML content)
+	 *
+	 * Returns the FEED_SOURCE_URL constant resolved through late static
+	 * binding, so a bridge can either redefine the constant or override this
+	 * method to build the URL dynamically.
+	 * @return string
+	 */
+	protected function getSourceUrl(){
+		return static::FEED_SOURCE_URL;
+	}
+
+	/**
+	 * XPath expression for extracting the feed title from the source page
+	 *
+	 * Returns the XPATH_EXPRESSION_FEED_TITLE constant of the concrete bridge
+	 * class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionTitle(){
+		return static::XPATH_EXPRESSION_FEED_TITLE;
+	}
+
+	/**
+	 * XPath expression for extracting the feed favicon from the source page
+	 *
+	 * Returns the XPATH_EXPRESSION_FEED_ICON constant of the concrete bridge
+	 * class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionIcon(){
+		return static::XPATH_EXPRESSION_FEED_ICON;
+	}
+
+	/**
+	 * XPath expression for extracting the feed items from the source page
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM constant of the concrete bridge class
+	 * (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItem(){
+		return static::XPATH_EXPRESSION_ITEM;
+	}
+
+	/**
+	 * XPath expression for extracting an item title from the item context
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM_TITLE constant of the concrete bridge
+	 * class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItemTitle(){
+		return static::XPATH_EXPRESSION_ITEM_TITLE;
+	}
+
+	/**
+	 * XPath expression for extracting an item's content from the item context
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM_CONTENT constant of the concrete
+	 * bridge class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItemContent(){
+		return static::XPATH_EXPRESSION_ITEM_CONTENT;
+	}
+
+	/**
+	 * XPath expression for extracting an item link from the item context
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM_URI constant of the concrete bridge
+	 * class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItemUri(){
+		return static::XPATH_EXPRESSION_ITEM_URI;
+	}
+
+	/**
+	 * XPath expression for extracting an item author from the item context
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM_AUTHOR constant of the concrete bridge
+	 * class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItemAuthor(){
+		return static::XPATH_EXPRESSION_ITEM_AUTHOR;
+	}
+
+	/**
+	 * XPath expression for extracting an item timestamp from the item context
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM_TIMESTAMP constant of the concrete
+	 * bridge class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItemTimestamp(){
+		return static::XPATH_EXPRESSION_ITEM_TIMESTAMP;
+	}
+
+	/**
+	 * XPath expression for extracting item enclosures (media content like
+	 * images or movies) from the item context
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM_ENCLOSURES constant of the concrete
+	 * bridge class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItemEnclosures(){
+		return static::XPATH_EXPRESSION_ITEM_ENCLOSURES;
+	}
+
+	/**
+	 * XPath expression for extracting an item category from the item context
+	 *
+	 * Returns the XPATH_EXPRESSION_ITEM_CATEGORIES constant of the concrete
+	 * bridge class (late static binding); override for a dynamic expression.
+	 * @return string
+	 */
+	protected function getExpressionItemCategories(){
+		return static::XPATH_EXPRESSION_ITEM_CATEGORIES;
+	}
+
+	/**
+	 * Fix encoding
+	 *
+	 * Returns the SETTING_FIX_ENCODING constant of the concrete bridge class
+	 * (late static binding). The value is only tested for truthiness by
+	 * {@see XPathAbstract::fixEncoding()}.
+	 * @return bool
+	 */
+	protected function getSettingFixEncoding(){
+		return static::SETTING_FIX_ENCODING;
+	}
+
+	/**
+	 * Internal helper giving uniform, name-based access to the overridable
+	 * getter methods that expose the user defined constants of derived
+	 * classes.
+	 *
+	 * @param string $name one of: url, feed_title, feed_icon, item, title,
+	 * content, uri, author, timestamp, enclosures, categories, fix_encoding
+	 * @return bool|string|null value delivered by the matching getter, null
+	 * for an unknown name
+	 */
+	private function getParam($name){
+		$getters = array(
+			'url' => 'getSourceUrl',
+			'feed_title' => 'getExpressionTitle',
+			'feed_icon' => 'getExpressionIcon',
+			'item' => 'getExpressionItem',
+			'title' => 'getExpressionItemTitle',
+			'content' => 'getExpressionItemContent',
+			'uri' => 'getExpressionItemUri',
+			'author' => 'getExpressionItemAuthor',
+			'timestamp' => 'getExpressionItemTimestamp',
+			'enclosures' => 'getExpressionItemEnclosures',
+			'categories' => 'getExpressionItemCategories',
+			'fix_encoding' => 'getSettingFixEncoding'
+		);
+
+		if(!isset($getters[$name])) {
+			return null;
+		}
+
+		$getter = $getters[$name];
+
+		return $this->$getter();
+	}
+
+	/**
+	 * Should provide the source website HTML content.
+	 * Can be overridden, for example if special headers or auth infos are
+	 * required. Note that $feedUri is private to this class, so an overriding
+	 * method has to obtain the URL via {@see XPathAbstract::getSourceUrl()}.
+	 * @return string
+	 */
+	protected function provideWebsiteContent() {
+		return getContents($this->feedUri);
+	}
+
+	/**
+	 * Should provide the feeds title
+	 *
+	 * Evaluates the feed title expression against the whole document. A
+	 * title is only reported when the expression matches exactly one node;
+	 * a blank expression, an invalid expression or any other number of
+	 * matches yields null, in which case getName() falls back to the parent
+	 * implementation.
+	 *
+	 * @param DOMXPath $xpath
+	 * @return string|null
+	 */
+	protected function provideFeedTitle(DOMXPath $xpath) {
+		$expression = $this->getParam('feed_title');
+		// A blank expression is explicitly allowed; querying it would fail
+		if(!is_string($expression) || '' === $expression) {
+			return null;
+		}
+
+		// query() returns false for an invalid expression. The length
+		// property is used because DOMNodeList is not countable before
+		// PHP 7.2.
+		$title = $xpath->query($expression);
+		if($title instanceof DOMNodeList && $title->length === 1) {
+			return $this->getItemValueOrNodeValue($title);
+		}
+
+		return null;
+	}
+
+	/**
+	 * Should provide the URL of the feed's favicon
+	 *
+	 * Evaluates the feed icon expression against the whole document. An icon
+	 * is only reported when the expression matches exactly one node; a blank
+	 * expression, an invalid expression or any other number of matches
+	 * yields null, in which case getIcon() falls back to the parent
+	 * implementation.
+	 *
+	 * @param DOMXPath $xpath
+	 * @return string|null
+	 */
+	protected function provideFeedIcon(DOMXPath $xpath) {
+		$expression = $this->getParam('feed_icon');
+		// A blank expression is explicitly allowed; querying it would fail
+		if(!is_string($expression) || '' === $expression) {
+			return null;
+		}
+
+		// query() returns false for an invalid expression. The length
+		// property is used because DOMNodeList is not countable before
+		// PHP 7.2.
+		$icon = $xpath->query($expression);
+		if($icon instanceof DOMNodeList && $icon->length === 1) {
+			return $this->cleanImageUrl($this->getItemValueOrNodeValue($icon));
+		}
+
+		return null;
+	}
+
+	/**
+	 * Should provide the feed's items.
+	 *
+	 * Runs the item expression against the whole document. Warnings are
+	 * suppressed; the caller ({@see XPathAbstract::collectData()}) treats a
+	 * false result as "no items".
+	 *
+	 * @param DOMXPath $xpath
+	 * @return DOMNodeList|false
+	 */
+	protected function provideFeedItems(DOMXPath $xpath) {
+		return @$xpath->query($this->getParam('item'));
+	}
+
+	/**
+	 * Fetches the source page, detects feed title and favicon and converts
+	 * every node matched by the item expression into a FeedItem.
+	 *
+	 * For each item node all field expressions (title, content, uri, author,
+	 * timestamp, enclosures, categories) are evaluated relative to that node.
+	 * Blank expressions and empty results are skipped, everything else is
+	 * passed through formatParamValue() and stored on the item.
+	 */
+	public function collectData() {
+
+		$this->feedUri = $this->getParam('url');
+
+		$webPageHtml = new DOMDocument();
+		// Keep libxml parse errors internal while loading real-world markup,
+		// then restore whatever error mode was active before
+		$previousErrorMode = libxml_use_internal_errors(true);
+		$webPageHtml->loadHTML($this->provideWebsiteContent());
+		libxml_clear_errors();
+		libxml_use_internal_errors($previousErrorMode);
+
+		$xpath = new DOMXPath($webPageHtml);
+
+		$this->feedName = $this->provideFeedTitle($xpath);
+		$this->feedIcon = $this->provideFeedIcon($xpath);
+
+		$entries = $this->provideFeedItems($xpath);
+		if($entries === false) {
+			return;
+		}
+
+		$params = array('title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories');
+
+		foreach ($entries as $entry) {
+			$item = new \FeedItem();
+			foreach($params as $param) {
+
+				$expression = $this->getParam($param);
+				if('' === $expression || null === $expression) {
+					continue;
+				}
+
+				//can be a string or DOMNodeList, depending on the expression result
+				$typedResult = @$xpath->evaluate($expression, $entry);
+				// length instead of count(): DOMNodeList is not countable before PHP 7.2
+				if ($typedResult === false
+					|| ($typedResult instanceof DOMNodeList && $typedResult->length === 0)
+					|| (is_string($typedResult) && strlen(trim($typedResult)) === 0)) {
+					continue;
+				}
+
+				$item->__set($param, $this->formatParamValue($param, $this->getItemValueOrNodeValue($typedResult)));
+
+			}
+
+			// A null id leaves the item's uid untouched
+			$itemId = $this->generateItemId($item);
+			if(null !== $itemId) {
+				$item->setUid($itemId);
+			}
+
+			$this->items[] = $item;
+		}
+
+	}
+
+	/**
+	 * Applies the encoding fix to an extracted raw value and hands it to the
+	 * overridable formatter belonging to the given item field.
+	 *
+	 * The encoding fix is applied exactly once, before dispatching. Values of
+	 * unknown fields are returned with only the encoding fix applied.
+	 *
+	 * @param string $param item field name (title, content, uri, author,
+	 * timestamp, enclosures or categories)
+	 * @param string $value raw value extracted from the source page
+	 * @return string|array|int|false
+	 */
+	protected function formatParamValue($param, $value)
+	{
+		$value = $this->fixEncoding($value);
+		switch ($param) {
+			case 'title':
+				return $this->formatItemTitle($value);
+			case 'content':
+				return $this->formatItemContent($value);
+			case 'uri':
+				return $this->formatItemUri($value);
+			case 'author':
+				return $this->formatItemAuthor($value);
+			case 'timestamp':
+				return $this->formatItemTimestamp($value);
+			case 'enclosures':
+				return $this->formatItemEnclosures($value);
+			case 'categories':
+				return $this->formatItemCategories($value);
+		}
+		return $value;
+	}
+
+	/**
+	 * Formats the title of a feed item. Takes the extracted raw title and returns it
+	 * formatted as string; this default implementation returns it unchanged.
+	 * Can be overridden in case the value needs to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemTitle($value) {
+		return $value;
+	}
+
+	/**
+	 * Formats the content of a feed item. Takes the extracted raw content and returns
+	 * it formatted as string; this default implementation returns it unchanged.
+	 * Can be overridden in case the value needs to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemContent($value) {
+		return $value;
+	}
+
+	/**
+	 * Formats the URI of a feed item.
+	 *
+	 * An empty value stays empty, a value starting with "http://" or
+	 * "https://" is returned untouched, and anything else is resolved against
+	 * the feed's source URL using urljoin().
+	 * Can be overridden in case the value needs to be transformed into
+	 * something else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemUri($value) {
+		if(strlen($value) === 0) {
+			return '';
+		}
+
+		$hasHttpScheme = strpos($value, 'http://') === 0 || strpos($value, 'https://') === 0;
+
+		return $hasHttpScheme ? $value : urljoin($this->feedUri, $value);
+	}
+
+	/**
+	 * Formats the author of a feed item. Takes the extracted raw author and returns it
+	 * formatted as string; this default implementation returns it unchanged.
+	 * Can be overridden in case the value needs to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return string
+	 */
+	protected function formatItemAuthor($value) {
+		return $value;
+	}
+
+	/**
+	 * Formats the timestamp of a feed item. Takes the extracted raw timestamp and
+	 * returns the result of PHP's strtotime(): a unix timestamp as integer, or
+	 * false when the value cannot be parsed.
+	 * Can be overridden, for example if a special format has to be expected on the
+	 * source website.
+	 * @param string $value
+	 * @return false|int
+	 */
+	protected function formatItemTimestamp($value) {
+		return strtotime($value);
+	}
+
+	/**
+	 * Formats the enclosures of a feed item. Takes the extracted raw enclosure value,
+	 * reduces it to an image URL via {@see XPathAbstract::cleanImageUrl()} and returns
+	 * it wrapped in a single-element array.
+	 * Can be overridden in case the values need to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return array
+	 */
+	protected function formatItemEnclosures($value) {
+		return array($this->cleanImageUrl($value));
+	}
+
+	/**
+	 * Formats the categories of a feed item. Takes the extracted raw category and
+	 * returns it wrapped in a single-element array.
+	 * Can be overridden in case the values need to be transformed into something
+	 * else.
+	 * @param string $value
+	 * @return array
+	 */
+	protected function formatItemCategories($value) {
+		return array($value);
+	}
+
+	/**
+	 * Extracts the first image URL found inside an arbitrary string (for
+	 * example the value of a style attribute) and resolves it against the
+	 * feed's source URL using urljoin().
+	 *
+	 * A match is an optional "http:" / "https:" prefix followed by slashes,
+	 * ASCII letters, digits, dashes, underscores and dots, ending in one of
+	 * the extensions jpg, gif, png, jpeg or ico.
+	 *
+	 * @param string $imageUrl
+	 * @return string|void nothing is returned when no image URL is found
+	 */
+	protected function cleanImageUrl($imageUrl)
+	{
+		$found = array();
+		$matchCount = preg_match(
+			'~(?:http(?:s)?:)?[\/a-zA-Z0-9\-_\.]+\.(?:jpg|gif|png|jpeg|ico){1}~',
+			$imageUrl,
+			$found
+		);
+
+		if(1 === $matchCount) {
+			return urljoin($this->feedUri, $found[0]);
+		}
+
+		return;
+	}
+
+	/**
+	 * Reduces the result of an XPath query / evaluation to a trimmed string.
+	 *
+	 * For a node list only the first node is considered: elements deliver
+	 * their node value, attributes their value and character data nodes
+	 * (such as text nodes selected via text()) their data. A non-empty string
+	 * result is returned trimmed. Anything else is reported through
+	 * returnServerError().
+	 *
+	 * @param DOMNodeList|string $typedResult
+	 * @return string
+	 */
+	protected function getItemValueOrNodeValue($typedResult)
+	{
+		if($typedResult instanceof DOMNodeList) {
+			$item = $typedResult->item(0);
+			if ($item instanceof DOMElement) {
+				return trim($item->nodeValue);
+			} elseif ($item instanceof DOMAttr) {
+				return trim($item->value);
+			} elseif ($item instanceof DOMCharacterData) {
+				// text / CDATA nodes, e.g. from expressions ending in text()
+				return trim($item->data);
+			}
+		} elseif(is_string($typedResult) && strlen($typedResult) > 0) {
+			return trim($typedResult);
+		}
+		returnServerError('Unknown type of XPath expression result.');
+	}
+
+	/**
+	 * Fixes feed encoding by invoking PHP's utf8_decode function on the given
+	 * text when the fix_encoding setting is truthy; otherwise the text is
+	 * returned unchanged.
+	 * Useful in case of "broken" or "weird" characters in the feed where you'd
+	 * normally expect umlauts.
+	 * NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - confirm the
+	 * targeted PHP versions before relying on it.
+	 *
+	 * @param string $input
+	 * @return string
+	 */
+	protected function fixEncoding($input)
+	{
+		if($this->getParam('fix_encoding')) {
+			return utf8_decode($input);
+		}
+
+		return $input;
+	}
+
+	/**
+	 * Allows overriding the default mechanism determining an item's Uid.
+	 *
+	 * collectData() calls this once per item after all fields have been set
+	 * and only invokes setUid() when the returned value is not null. This
+	 * default implementation returns null, leaving the uid untouched.
+	 *
+	 * @param FeedItem $item
+	 * @return string|null
+	 */
+	protected function generateItemId(\FeedItem $item) {
+		return null; //auto generation
+	}
+}
--- a/lib/rssbridge.php
+++ b/lib/rssbridge.php
@ -74,6 +74,7 @@ require_once PATH_LIB . 'BridgeList.php';
 require_once PATH_LIB . 'ParameterValidator.php';
 require_once PATH_LIB . 'ActionFactory.php';
 require_once PATH_LIB . 'ActionAbstract.php';
+require_once PATH_LIB . 'XPathAbstract.php';

 // Functions
 require_once PATH_LIB . 'html.php';