<?php

/**
 * An alternative abstract class for bridges utilizing XPath expressions
 *
 * This class is meant as an alternative base class for bridge implementations.
 * It offers preliminary functionality for generating feeds based on XPath
 * expressions.
 * As a minimum, extending classes should define XPath expressions pointing
 * to the feed items' contents in the class constants below. In case more
 * manual fine-tuning is required, it offers a number of methods which can
 * be overridden, for example in order to specify the formatting of field
 * values or a more flexible definition of dynamic XPath expressions.
 *
 * This class extends {@see BridgeAbstract}, which means it incorporates and
 * extends all of its functionality.
 **/
abstract class XPathAbstract extends BridgeAbstract {

	/**
	 * Source Web page URL (should provide either HTML or XML content)
	 * You can specify any website URL which serves data suited for display in RSS feeds
	 * (for example a news blog).
	 *
	 * Use {@see XPathAbstract::getSourceUrl()} to read this parameter
	 */
	const FEED_SOURCE_URL = '';

	/**
	 * XPath expression for extracting the feed title from the source page.
	 * If this is left blank or does not provide any data {@see BridgeAbstract::getName()}
	 * is used instead as the feed's title.
	 *
	 * Use {@see XPathAbstract::getExpressionTitle()} to read this parameter
	 */
	const XPATH_EXPRESSION_FEED_TITLE = './/title';

	/**
	 * XPath expression for extracting the feed favicon URL from the source page.
	 * If this is left blank or does not provide any data {@see BridgeAbstract::getIcon()}
	 * is used instead as the feed's favicon URL.
	 *
	 * Use {@see XPathAbstract::getExpressionIcon()} to read this parameter
	 */
	const XPATH_EXPRESSION_FEED_ICON = './/link[@rel="icon"]/@href';

	/**
	 * XPath expression for extracting the feed items from the source page
	 * Enter an XPath expression matching a list of dom nodes, each node containing one
	 * feed article item in total (usually a surrounding <div> or <span> tag). These will
	 * be the context nodes for all of the following expressions. This expression usually
	 * starts with a single forward slash.
	 *
	 * Use {@see XPathAbstract::getExpressionItem()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM = '';

	/**
	 * XPath expression for extracting an item title from the item context
	 * This expression should match a node contained within each article item node
	 * containing the article headline. It should start with a dot followed by two
	 * forward slashes, referring to any descendant nodes of the article item node.
	 *
	 * Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_TITLE = '';

	/**
	 * XPath expression for extracting an item's content from the item context
	 * This expression should match a node contained within each article item node
	 * containing the article content or description. It should start with a dot
	 * followed by two forward slashes, referring to any descendant nodes of the
	 * article item node.
	 *
	 * Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_CONTENT = '';

	/**
	 * XPath expression for extracting an item link from the item context
	 * This expression should match a node's attribute containing the article URL
	 * (usually the href attribute of an <a> tag). It should start with a dot
	 * followed by two forward slashes, referring to any descendant nodes of
	 * the article item node. Attributes can be selected by prepending an @ char
	 * before the attribute's name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_URI = '';

	/**
	 * XPath expression for extracting an item author from the item context
	 * This expression should match a node contained within each article item
	 * node containing the article author's name. It should start with a dot
	 * followed by two forward slashes, referring to any descendant nodes of
	 * the article item node.
	 *
	 * Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_AUTHOR = '';

	/**
	 * XPath expression for extracting an item timestamp from the item context
	 * This expression should match a node or node's attribute containing the
	 * article timestamp or date (parsable by PHP's strtotime function). It
	 * should start with a dot followed by two forward slashes, referring to
	 * any descendant nodes of the article item node. Attributes can be
	 * selected by prepending an @ char before the attribute's name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';

	/**
	 * XPath expression for extracting item enclosures (media content like
	 * images or movies) from the item context
	 * This expression should match a node's attribute containing an article
	 * image URL (usually the src attribute of an <img> tag or a style
	 * attribute). It should start with a dot followed by two forward slashes,
	 * referring to any descendant nodes of the article item node. Attributes
	 * can be selected by prepending an @ char before the attribute's name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';

	/**
	 * XPath expression for extracting an item category from the item context
	 * This expression should match a node or node's attribute contained
	 * within each article item node containing the article category. This
	 * could be inside <div> or <span> tags or sometimes be hidden
	 * in a data attribute. It should start with a dot followed by two
	 * forward slashes, referring to any descendant nodes of the article
	 * item node. Attributes can be selected by prepending an @ char
	 * before the attribute's name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_CATEGORIES = '';

	/**
	 * Fix encoding
	 * Set this to true for fixing feed encoding by converting all extracted
	 * texts from UTF-8 to ISO-8859-1 (what PHP's utf8_decode function does).
	 * Try this in case you see "broken" or "weird" characters in your feed
	 * where you'd normally expect umlauts or any other non-ascii characters.
	 *
	 * Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
	 */
	const SETTING_FIX_ENCODING = false;

	/**
	 * Internal storage for resulting feed name, automatically detected
	 * @var string|null
	 */
	private $feedName;

	/**
	 * Internal storage for the feed URI; set from the source URL at the
	 * beginning of {@see XPathAbstract::collectData()}
	 * @var string|null
	 */
	private $feedUri;

	/**
	 * Internal storage for resulting feed favicon, automatically detected
	 * @var string|null
	 */
	private $feedIcon;

	/**
	 * Returns the detected feed title, falling back to the parent's name
	 * while nothing (or an empty value) has been detected.
	 * @return string
	 */
	public function getName(){
		return $this->feedName ?: parent::getName();
	}

	/**
	 * Returns the source URL once data collection has started, falling back
	 * to the parent's URI before that.
	 * @return string
	 */
	public function getURI() {
		return $this->feedUri ?: parent::getURI();
	}

	/**
	 * Returns the detected favicon URL, falling back to the parent's icon
	 * while nothing (or an empty value) has been detected.
	 * @return string
	 */
	public function getIcon() {
		return $this->feedIcon ?: parent::getIcon();
	}

	/**
	 * Source Web page URL (should provide either HTML or XML content)
	 * @return string
	 */
	protected function getSourceUrl(){
		return static::FEED_SOURCE_URL;
	}

	/**
	 * XPath expression for extracting the feed title from the source page
	 * @return string
	 */
	protected function getExpressionTitle(){
		return static::XPATH_EXPRESSION_FEED_TITLE;
	}

	/**
	 * XPath expression for extracting the feed favicon from the source page
	 * @return string
	 */
	protected function getExpressionIcon(){
		return static::XPATH_EXPRESSION_FEED_ICON;
	}

	/**
	 * XPath expression for extracting the feed items from the source page
	 * @return string
	 */
	protected function getExpressionItem(){
		return static::XPATH_EXPRESSION_ITEM;
	}

	/**
	 * XPath expression for extracting an item title from the item context
	 * @return string
	 */
	protected function getExpressionItemTitle(){
		return static::XPATH_EXPRESSION_ITEM_TITLE;
	}

	/**
	 * XPath expression for extracting an item's content from the item context
	 * @return string
	 */
	protected function getExpressionItemContent(){
		return static::XPATH_EXPRESSION_ITEM_CONTENT;
	}

	/**
	 * XPath expression for extracting an item link from the item context
	 * @return string
	 */
	protected function getExpressionItemUri(){
		return static::XPATH_EXPRESSION_ITEM_URI;
	}

	/**
	 * XPath expression for extracting an item author from the item context
	 * @return string
	 */
	protected function getExpressionItemAuthor(){
		return static::XPATH_EXPRESSION_ITEM_AUTHOR;
	}

	/**
	 * XPath expression for extracting an item timestamp from the item context
	 * @return string
	 */
	protected function getExpressionItemTimestamp(){
		return static::XPATH_EXPRESSION_ITEM_TIMESTAMP;
	}

	/**
	 * XPath expression for extracting item enclosures (media content like
	 * images or movies) from the item context
	 * @return string
	 */
	protected function getExpressionItemEnclosures(){
		return static::XPATH_EXPRESSION_ITEM_ENCLOSURES;
	}

	/**
	 * XPath expression for extracting an item category from the item context
	 * @return string
	 */
	protected function getExpressionItemCategories(){
		return static::XPATH_EXPRESSION_ITEM_CATEGORIES;
	}

	/**
	 * Fix encoding
	 * @return bool
	 */
	protected function getSettingFixEncoding(){
		return static::SETTING_FIX_ENCODING;
	}

	/**
	 * Internal helper method for quickly accessing all the user defined constants
	 * in derived classes. Every lookup goes through the matching protected getter,
	 * so overriding a getter in a derived class takes effect here as well.
	 *
	 * @param string $name Short parameter name, e.g. 'url', 'item' or 'title'
	 * @return bool|string|null The parameter value, null for an unknown name
	 */
	private function getParam($name){
		switch($name) {

			case 'url':
				return $this->getSourceUrl();
			case 'feed_title':
				return $this->getExpressionTitle();
			case 'feed_icon':
				return $this->getExpressionIcon();
			case 'item':
				return $this->getExpressionItem();
			case 'title':
				return $this->getExpressionItemTitle();
			case 'content':
				return $this->getExpressionItemContent();
			case 'uri':
				return $this->getExpressionItemUri();
			case 'author':
				return $this->getExpressionItemAuthor();
			case 'timestamp':
				return $this->getExpressionItemTimestamp();
			case 'enclosures':
				return $this->getExpressionItemEnclosures();
			case 'categories':
				return $this->getExpressionItemCategories();
			case 'fix_encoding':
				return $this->getSettingFixEncoding();
			default:
				return null;
		}
	}

	/**
	 * Should provide the source website HTML content
	 * can be easily overwritten for example if special headers or auth infos are required
	 * @return string
	 */
	protected function provideWebsiteContent() {
		return getContents($this->feedUri);
	}

	/**
	 * Should provide the feed's title
	 *
	 * A title is only returned if the title expression matches exactly one node.
	 * A blank expression, a failing query or any other number of matches yields
	 * null, which makes {@see XPathAbstract::getName()} use its fallback.
	 *
	 * @param DOMXPath $xpath
	 * @return string|null
	 */
	protected function provideFeedTitle(DOMXPath $xpath) {
		$expression = $this->getParam('feed_title');
		if(!is_string($expression) || '' === $expression) {
			return null;
		}

		// query() returns false (not a list) for expressions it cannot evaluate
		$title = $xpath->query($expression);
		if($title instanceof DOMNodeList && $title->length === 1) {
			return $this->getItemValueOrNodeValue($title);
		}

		return null;
	}

	/**
	 * Should provide the URL of the feed's favicon
	 *
	 * An icon is only returned if the icon expression matches exactly one node.
	 * A blank expression, a failing query or any other number of matches yields
	 * null, which makes {@see XPathAbstract::getIcon()} use its fallback.
	 *
	 * @param DOMXPath $xpath
	 * @return string|null
	 */
	protected function provideFeedIcon(DOMXPath $xpath) {
		$expression = $this->getParam('feed_icon');
		if(!is_string($expression) || '' === $expression) {
			return null;
		}

		// query() returns false (not a list) for expressions it cannot evaluate
		$icon = $xpath->query($expression);
		if($icon instanceof DOMNodeList && $icon->length === 1) {
			return $this->cleanImageUrl($this->getItemValueOrNodeValue($icon));
		}

		return null;
	}

	/**
	 * Should provide the feed's items.
	 *
	 * Warnings are suppressed on purpose: for an expression which cannot be
	 * evaluated the query result is false, which
	 * {@see XPathAbstract::collectData()} treats as "no items".
	 *
	 * @param DOMXPath $xpath
	 * @return DOMNodeList|false
	 */
	protected function provideFeedItems(DOMXPath $xpath) {
		return @$xpath->query($this->getParam('item'));
	}

	/**
	 * Loads the source page, detects feed title and icon and turns every node
	 * matched by the item expression into a feed item.
	 */
	public function collectData() {

		$this->feedUri = $this->getParam('url');

		// DOMDocument::loadHTML() rejects empty input, so there is nothing to parse
		$webPageContent = (string)$this->provideWebsiteContent();
		if('' === $webPageContent) {
			return;
		}

		$webPageHtml = new DOMDocument();
		// Collect markup errors silently while parsing, then restore whatever
		// error handling mode was active before instead of forcing it off.
		$previousErrorMode = libxml_use_internal_errors(true);
		$webPageHtml->loadHTML($webPageContent);
		libxml_clear_errors();
		libxml_use_internal_errors($previousErrorMode);

		$xpath = new DOMXPath($webPageHtml);

		$this->feedName = $this->provideFeedTitle($xpath);
		$this->feedIcon = $this->provideFeedIcon($xpath);

		$entries = $this->provideFeedItems($xpath);
		if($entries === false || $entries === null) {
			return;
		}

		$itemParams = array('title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories');

		foreach ($entries as $entry) {
			$item = new \FeedItem();
			foreach($itemParams as $param) {

				$expression = $this->getParam($param);
				if('' === $expression) {
					continue;
				}

				//can be a string or DOMNodeList, depending on the expression result
				$typedResult = @$xpath->evaluate($expression, $entry);
				if ($typedResult === false
					|| ($typedResult instanceof DOMNodeList && $typedResult->length === 0)
					|| (is_string($typedResult) && strlen(trim($typedResult)) === 0)) {
					continue;
				}

				$item->__set($param, $this->formatParamValue($param, $this->getItemValueOrNodeValue($typedResult)));

			}

			$itemId = $this->generateItemId($item);
			if(null !== $itemId) {
				$item->setUid($itemId);
			}

			$this->items[] = $item;
		}

	}

	/**
	 * Applies the encoding fix (if enabled) to an extracted value and passes it
	 * on to the formatting method belonging to the given item parameter.
	 *
	 * @param string $param Item parameter name, e.g. 'title' or 'enclosures'
	 * @param string $value Raw extracted value
	 * @return string|array|int|false
	 */
	protected function formatParamValue($param, $value)
	{
		// The encoding fix is applied exactly once here; the format methods
		// below receive the already fixed value.
		$value = $this->fixEncoding($value);
		switch ($param) {
			case 'title':
				return $this->formatItemTitle($value);
			case 'content':
				return $this->formatItemContent($value);
			case 'uri':
				return $this->formatItemUri($value);
			case 'author':
				return $this->formatItemAuthor($value);
			case 'timestamp':
				return $this->formatItemTimestamp($value);
			case 'enclosures':
				return $this->formatItemEnclosures($value);
			case 'categories':
				return $this->formatItemCategories($value);
		}
		return $value;
	}

	/**
	 * Formats the title of a feed item. Takes extracted raw title and returns it formatted
	 * as string.
	 * Can be easily overwritten in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemTitle($value) {
		return $value;
	}

	/**
	 * Formats the content of a feed item. Takes extracted raw content and returns it
	 * formatted as string.
	 * Can be easily overwritten in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemContent($value) {
		return $value;
	}

	/**
	 * Formats the URI of a feed item. Takes extracted raw URI and returns it formatted
	 * as string. Values starting with http:// or https:// are returned unchanged, all
	 * other non-empty values are joined with the feed's source URL.
	 * Can be easily overwritten in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemUri($value) {
		if(strlen($value) === 0) {
			return '';
		}
		if(strpos($value, 'http://') === 0 || strpos($value, 'https://') === 0) {
			return $value;
		}

		return urljoin($this->feedUri, $value);
	}

	/**
	 * Formats the author of a feed item. Takes extracted raw author and returns it formatted
	 * as string.
	 * Can be easily overwritten in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemAuthor($value) {
		return $value;
	}

	/**
	 * Formats the timestamp of a feed item. Takes extracted raw timestamp and returns unix
	 * timestamp as integer (false if PHP's strtotime function cannot parse the value).
	 * Can be easily overwritten for example if a special format has to be expected on the
	 * source website.
	 * @param string $value
	 * @return false|int
	 */
	protected function formatItemTimestamp($value) {
		return strtotime($value);
	}

	/**
	 * Formats the enclosures of a feed item. Takes extracted raw enclosures and returns them
	 * formatted as array.
	 * Can be easily overwritten in case the values need to be transformed into something
	 * else.
	 * @param string $value
	 * @return array
	 */
	protected function formatItemEnclosures($value) {
		return array($this->cleanImageUrl($value));
	}

	/**
	 * Formats the categories of a feed item. Takes extracted raw categories and returns them
	 * formatted as array.
	 * Can be easily overwritten in case the values need to be transformed into something
	 * else.
	 * @param string $value
	 * @return array
	 */
	protected function formatItemCategories($value) {
		return array($value);
	}

	/**
	 * Extracts the first image URL (jpg, gif, png, jpeg or ico) found within the
	 * given text and joins it with the feed's source URL.
	 *
	 * @param string $imageUrl Text containing an image URL, e.g. an attribute value
	 * @return string|null The image URL, null if the text contains none
	 */
	protected function cleanImageUrl($imageUrl)
	{
		$result = preg_match('~(?:http(?:s)?:)?[\/a-zA-Z0-9\-_\.]+\.(?:jpg|gif|png|jpeg|ico){1}~', $imageUrl, $matches);
		if(1 !== $result) {
			return null;
		}
		return urljoin($this->feedUri, $matches[0]);
	}

	/**
	 * Reduces an XPath result to a trimmed string: for a node list the value of
	 * its first node (element, attribute, text or any other DOM node), for a
	 * non-empty string the string itself. Everything else is reported as a
	 * server error.
	 *
	 * @param DOMNodeList|string $typedResult
	 * @return string
	 */
	protected function getItemValueOrNodeValue($typedResult)
	{
		if($typedResult instanceof DOMNodeList) {
			$item = $typedResult->item(0);
			// nodeValue holds the text content of elements, the value of
			// attributes and the data of text nodes
			if($item instanceof DOMNode) {
				return trim((string)$item->nodeValue);
			}
		} elseif(is_string($typedResult) && strlen($typedResult) > 0) {
			return trim($typedResult);
		}
		returnServerError('Unknown type of XPath expression result.');
	}

	/**
	 * Fixes feed encoding by converting extracted texts from UTF-8 to ISO-8859-1.
	 * Useful in case of "broken" or "weird" characters in the feed where you'd normally
	 * expect umlauts. Returns the input unchanged unless the fix is enabled.
	 *
	 * @param string $input
	 * @return string
	 */
	protected function fixEncoding($input)
	{
		if(!$this->getParam('fix_encoding')) {
			return $input;
		}

		// utf8_decode() is deprecated as of PHP 8.2. mb_convert_encoding() is the
		// documented replacement for the same conversion, so prefer it whenever
		// the mbstring extension is available.
		if(function_exists('mb_convert_encoding')) {
			return mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');
		}

		return utf8_decode($input);
	}

	/**
	 * Allows overriding default mechanism determining items Uid's
	 *
	 * @param FeedItem $item
	 * @return string|null A custom Uid, null to leave the item's Uid untouched
	 */
	protected function generateItemId(\FeedItem $item) {
		return null; //auto generation
	}
}