[BridgeXPathAbstract + BlizzardNewsBridge + XPathBridge] Add new abstract class + two example implementations (#1671)
This commit is contained in:
parent
d05a8b79fe
commit
3ad138026d
5 changed files with 897 additions and 1 deletions
60
bridges/BlizzardNewsBridge.php
Normal file
60
bridges/BlizzardNewsBridge.php
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Bridge for the Blizzard news page.
 *
 * Item extraction is configured entirely through the XPath constants below;
 * only the source URL is computed at runtime because it depends on the
 * locale selected by the user.
 */
class BlizzardNewsBridge extends XPathAbstract {

	const NAME = 'Blizzard News';
	const URI = 'https://news.blizzard.com';
	const DESCRIPTION = 'Blizzard (game company) newsfeed';
	const MAINTAINER = 'Niehztog';
	const PARAMETERS = array(
		'' => array(
			'locale' => array(
				'name' => 'Language',
				'type' => 'list',
				'values' => array(
					'Deutsch' => 'de-de',
					'English (EU)' => 'en-gb',
					'English (US)' => 'en-us',
					'Español (EU)' => 'es-es',
					'Español (AL)' => 'es-mx',
					'Français' => 'fr-fr',
					'Italiano' => 'it-it',
					'日本語' => 'ja-jp',
					'한국어' => 'ko-kr',
					'Polski' => 'pl-pl',
					'Português (AL)' => 'pt-br',
					'Русский' => 'ru-ru',
					'ภาษาไทย' => 'th-th',
					'简体中文' => 'zh-cn',
					'繁體中文' => 'zh-tw'
				),
				'defaultValue' => 'en-us',
				'title' => 'Select your language'
			)
		)
	);
	const CACHE_TIMEOUT = 3600;

	// XPath expressions evaluated by XPathAbstract; an empty expression means
	// the corresponding item field is not extracted.
	const XPATH_EXPRESSION_ITEM = '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article';
	const XPATH_EXPRESSION_ITEM_TITLE = './/div/div[2]/h2';
	const XPATH_EXPRESSION_ITEM_CONTENT = './/div[@class="ArticleListItem-description"]/div[@class="h6"]';
	const XPATH_EXPRESSION_ITEM_URI = './/a[@class="ArticleLink ArticleLink"]/@href';
	const XPATH_EXPRESSION_ITEM_AUTHOR = '';
	const XPATH_EXPRESSION_ITEM_TIMESTAMP = './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp';
	const XPATH_EXPRESSION_ITEM_ENCLOSURES = './/div[@class="ArticleListItem-image"]/@style';
	const XPATH_EXPRESSION_ITEM_CATEGORIES = './/div[@class="ArticleListItem-label"]';
	const SETTING_FIX_ENCODING = true;

	/**
	 * Builds the news page URL for the locale chosen in the bridge parameters.
	 *
	 * @return string
	 */
	protected function getSourceUrl(){
		$locale = $this->getInput('locale');

		// zh-cn lives on its own host instead of a locale path segment
		return 'zh-cn' === $locale
			? 'https://cn.news.blizzard.com'
			: 'https://news.blizzard.com/' . $locale;
	}
}
|
251
bridges/XPathBridge.php
Normal file
251
bridges/XPathBridge.php
Normal file
|
@ -0,0 +1,251 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Generic bridge whose source URL and XPath expressions are all supplied by
 * the user through the bridge parameters instead of class constants.
 */
class XPathBridge extends XPathAbstract {
	const NAME = 'XPathBridge';
	const URI = 'https://github.com/rss-bridge/rss-bridge';
	const DESCRIPTION
		= 'Parse any webpage using <a href="https://devhints.io/xpath" target="_blank">XPath expressions</a>';
	const MAINTAINER = 'Niehztog';
	const PARAMETERS = array(
		'' => array(

			'url' => array(
				'name' => 'Enter web page URL',
				'title' => <<<"EOL"
You can specify any website URL which serves data suited for display in RSS feeds
(for example a news blog).
EOL
				, 'type' => 'text',
				'exampleValue' => 'https://news.blizzard.com/en-en',
				'defaultValue' => 'https://news.blizzard.com/en-en',
				'required' => true
			),

			'item' => array(
				'name' => 'Item selector',
				'title' => <<<"EOL"
Enter an XPath expression matching a list of dom nodes, each node containing one
feed article item in total (usually a surrounding <div> or <span> tag). This will
be the context nodes for all of the following expressions. This expression usually
starts with a single forward slash.
EOL
				, 'type' => 'text',
				'exampleValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
				'defaultValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
				'required' => true
			),

			'title' => array(
				'name' => 'Item title selector',
				'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article headline. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article item node.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div/div[2]/h2',
				'defaultValue' => './/div/div[2]/h2',
				'required' => true
			),

			'content' => array(
				'name' => 'Item description selector',
				'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article content or description. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of the
article item node.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
				'defaultValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
				'required' => false
			),

			'uri' => array(
				'name' => 'Item URL selector',
				'title' => <<<"EOL"
This expression should match a node's attribute containing the article URL
(usually the href attribute of an <a> tag). It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
				'defaultValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
				'required' => false
			),

			'author' => array(
				'name' => 'Item author selector',
				'title' => <<<"EOL"
This expression should match a node contained within each article item
node containing the article author's name. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node.
EOL
				, 'type' => 'text',
				'required' => false
			),

			'timestamp' => array(
				'name' => 'Item date selector',
				'title' => <<<"EOL"
This expression should match a node or node's attribute containing the
article timestamp or date (parsable by PHP's strtotime function). It
should start with a dot followed by two forward slashes, referring to
any descendant nodes of the article item node. Attributes can be
selected by prepending an @ char before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
				'defaultValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
				'required' => false
			),

			'enclosures' => array(
				'name' => 'Item image selector',
				'title' => <<<"EOL"
This expression should match a node's attribute containing an article
image URL (usually the src attribute of an <img> tag or a style
attribute). It should start with a dot followed by two forward slashes,
referring to any descendant nodes of the article item node. Attributes
can be selected by prepending an @ char before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div[@class="ArticleListItem-image"]/@style',
				'defaultValue' => './/div[@class="ArticleListItem-image"]/@style',
				'required' => false
			),

			'categories' => array(
				'name' => 'Item category selector',
				'title' => <<<"EOL"
This expression should match a node or node's attribute contained
within each article item node containing the article category. This
could be inside <div> or <span> tags or sometimes be hidden
in a data attribute. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article
item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div[@class="ArticleListItem-label"]',
				'defaultValue' => './/div[@class="ArticleListItem-label"]',
				'required' => false
			),

			'fix_encoding' => array(
				'name' => 'Fix encoding',
				'title' => <<<"EOL"
Check this to fix feed encoding by invoking PHP's utf8_decode
function on all extracted texts. Try this in case you see "broken" or
"weird" characters in your feed where you'd normally expect umlauts
or any other non-ascii characters.
EOL
				, 'type' => 'checkbox',
				'required' => false
			),

		)
	);

	/**
	 * URL of the page to parse, taken from the 'url' input.
	 *
	 * @return string
	 */
	protected function getSourceUrl(){
		$rawUrl = $this->getInput('url');
		return $this->encodeUri($rawUrl);
	}

	/**
	 * XPath expression selecting the feed item nodes ('item' input).
	 *
	 * @return string
	 */
	protected function getExpressionItem(){
		return $this->decodeInput('item');
	}

	/**
	 * XPath expression selecting an item's title ('title' input).
	 *
	 * @return string
	 */
	protected function getExpressionItemTitle(){
		return $this->decodeInput('title');
	}

	/**
	 * XPath expression selecting an item's content ('content' input).
	 *
	 * @return string
	 */
	protected function getExpressionItemContent(){
		return $this->decodeInput('content');
	}

	/**
	 * XPath expression selecting an item's link ('uri' input).
	 *
	 * @return string
	 */
	protected function getExpressionItemUri(){
		return $this->decodeInput('uri');
	}

	/**
	 * XPath expression selecting an item's author ('author' input).
	 *
	 * @return string
	 */
	protected function getExpressionItemAuthor(){
		return $this->decodeInput('author');
	}

	/**
	 * XPath expression selecting an item's timestamp ('timestamp' input).
	 *
	 * @return string
	 */
	protected function getExpressionItemTimestamp(){
		return $this->decodeInput('timestamp');
	}

	/**
	 * XPath expression selecting an item's enclosures ('enclosures' input).
	 *
	 * @return string
	 */
	protected function getExpressionItemEnclosures(){
		return $this->decodeInput('enclosures');
	}

	/**
	 * XPath expression selecting an item's categories ('categories' input).
	 *
	 * @return string
	 */
	protected function getExpressionItemCategories(){
		return $this->decodeInput('categories');
	}

	/**
	 * Value of the 'fix_encoding' checkbox input, returned as provided by getInput().
	 *
	 * @return mixed
	 */
	protected function getSettingFixEncoding(){
		return $this->getInput('fix_encoding');
	}

	/**
	 * Reads a bridge input and passes it through urldecode().
	 *
	 * @param string $name Name of the input parameter
	 * @return string
	 */
	private function decodeInput($name)
	{
		return urldecode($this->getInput($name));
	}

	/**
	 * Fixes URL encoding issues in input URLs.
	 *
	 * A URL whose scheme prefix arrives percent-encoded is decoded once, and
	 * literal pipe characters are percent-encoded.
	 *
	 * @param $uri
	 * @return string|string[]
	 */
	private function encodeUri($uri)
	{
		$hasEncodedHttps = strpos($uri, 'https%3A%2F%2F') === 0;
		$hasEncodedHttp = strpos($uri, 'http%3A%2F%2F') === 0;

		if ($hasEncodedHttps || $hasEncodedHttp) {
			$uri = urldecode($uri);
		}

		return str_replace('|', '%7C', $uri);
	}
}
|
|
@ -34,6 +34,7 @@
|
||||||
},
|
},
|
||||||
"suggest": {
|
"suggest": {
|
||||||
"ext-memcached": "Allows to use memcached as cache type",
|
"ext-memcached": "Allows to use memcached as cache type",
|
||||||
"ext-sqlite3": "Allows to use an SQLite database for caching"
|
"ext-sqlite3": "Allows to use an SQLite database for caching",
|
||||||
|
"ext-dom": "Allows to use some bridges based on XPath expressions"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
583
lib/XPathAbstract.php
Normal file
583
lib/XPathAbstract.php
Normal file
|
@ -0,0 +1,583 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An alternative abstract class for bridges utilizing XPath expressions
|
||||||
|
*
|
||||||
|
* This class is meant as an alternative base class for bridge implementations.
|
||||||
|
* It offers preliminary functionality for generating feeds based on XPath
|
||||||
|
* expressions.
|
||||||
|
* As a minimum, extending classes should define XPath expressions pointing
|
||||||
|
* to the feed items contents in the class constants below. In case there is
|
||||||
|
* more manual fine tuning required, it offers a bunch of methods which can
|
||||||
|
* be overridden, for example in order to specify formatting of field values
|
||||||
|
* or more flexible definition of dynamic XPath expressions.
|
||||||
|
*
|
||||||
|
* This class extends {@see BridgeAbstract}, which means it incorporates and
|
||||||
|
* extends all of its functionality.
|
||||||
|
**/
|
||||||
|
abstract class XPathAbstract extends BridgeAbstract {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Source Web page URL (should provide either HTML or XML content)
|
||||||
|
* You can specify any website URL which serves data suited for display in RSS feeds
|
||||||
|
* (for example a news blog).
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getSourceUrl()} to read this parameter
|
||||||
|
*/
|
||||||
|
const FEED_SOURCE_URL = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting the feed title from the source page.
|
||||||
|
* If this is left blank or does not provide any data {@see BridgeAbstract::getName()}
|
||||||
|
* is used instead as the feed's title.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionTitle()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_FEED_TITLE = './/title';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting the feed favicon URL from the source page.
|
||||||
|
* If this is left blank or does not provide any data {@see BridgeAbstract::getIcon()}
|
||||||
|
* is used instead as the feed's favicon URL.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionIcon()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_FEED_ICON = './/link[@rel="icon"]/@href';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting the feed items from the source page
|
||||||
|
* Enter an XPath expression matching a list of dom nodes, each node containing one
|
||||||
|
* feed article item in total (usually a surrounding <div> or <span> tag). This will
|
||||||
|
* be the context nodes for all of the following expressions. This expression usually
|
||||||
|
* starts with a single forward slash.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItem()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item title from the item context
|
||||||
|
* This expression should match a node contained within each article item node
|
||||||
|
* containing the article headline. It should start with a dot followed by two
|
||||||
|
* forward slashes, referring to any descendant nodes of the article item node.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM_TITLE = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item's content from the item context
|
||||||
|
* This expression should match a node contained within each article item node
|
||||||
|
* containing the article content or description. It should start with a dot
|
||||||
|
* followed by two forward slashes, referring to any descendant nodes of the
|
||||||
|
* article item node.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM_CONTENT = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item link from the item context
|
||||||
|
* This expression should match a node's attribute containing the article URL
|
||||||
|
* (usually the href attribute of an <a> tag). It should start with a dot
|
||||||
|
* followed by two forward slashes, referring to any descendant nodes of
|
||||||
|
* the article item node. Attributes can be selected by prepending an @ char
|
||||||
|
* before the attributes name.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM_URI = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item author from the item context
|
||||||
|
* This expression should match a node contained within each article item
|
||||||
|
* node containing the article author's name. It should start with a dot
|
||||||
|
* followed by two forward slashes, referring to any descendant nodes of
|
||||||
|
* the article item node.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM_AUTHOR = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item timestamp from the item context
|
||||||
|
* This expression should match a node or node's attribute containing the
|
||||||
|
* article timestamp or date (parsable by PHP's strtotime function). It
|
||||||
|
* should start with a dot followed by two forward slashes, referring to
|
||||||
|
* any descendant nodes of the article item node. Attributes can be
|
||||||
|
* selected by prepending an @ char before the attributes name.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting item enclosures (media content like
|
||||||
|
* images or movies) from the item context
|
||||||
|
* This expression should match a node's attribute containing an article
|
||||||
|
* image URL (usually the src attribute of an <img> tag or a style
|
||||||
|
* attribute). It should start with a dot followed by two forward slashes,
|
||||||
|
* referring to any descendant nodes of the article item node. Attributes
|
||||||
|
* can be selected by prepending an @ char before the attributes name.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item category from the item context
|
||||||
|
* This expression should match a node or node's attribute contained
|
||||||
|
* within each article item node containing the article category. This
|
||||||
|
* could be inside <div> or <span> tags or sometimes be hidden
|
||||||
|
* in a data attribute. It should start with a dot followed by two
|
||||||
|
* forward slashes, referring to any descendant nodes of the article
|
||||||
|
* item node. Attributes can be selected by prepending an @ char
|
||||||
|
* before the attributes name.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
|
||||||
|
*/
|
||||||
|
const XPATH_EXPRESSION_ITEM_CATEGORIES = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fix encoding
|
||||||
|
* Set this to true for fixing feed encoding by invoking PHP's utf8_decode
|
||||||
|
* function on all extracted texts. Try this in case you see "broken" or
|
||||||
|
* "weird" characters in your feed where you'd normally expect umlauts
|
||||||
|
* or any other non-ascii characters.
|
||||||
|
*
|
||||||
|
* Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
|
||||||
|
*/
|
||||||
|
const SETTING_FIX_ENCODING = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal storage for resulting feed name, automatically detected
|
||||||
|
* @var string
|
||||||
|
*/
|
||||||
|
private $feedName;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal storage for the resulting feed URI (the source URL used while collecting data)
|
||||||
|
* @var string
|
||||||
|
*/
|
||||||
|
private $feedUri;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal storage for resulting feed favicon, automatically detected
|
||||||
|
* @var string
|
||||||
|
*/
|
||||||
|
private $feedIcon;
|
||||||
|
|
||||||
|
public function getName()
{
	// Prefer the feed title stored while collecting data; otherwise use the parent's name
	if ($this->feedName) {
		return $this->feedName;
	}

	return parent::getName();
}
|
||||||
|
|
||||||
|
public function getURI()
{
	// Prefer the source URL stored while collecting data; otherwise use the parent's URI
	if ($this->feedUri) {
		return $this->feedUri;
	}

	return parent::getURI();
}
|
||||||
|
|
||||||
|
public function getIcon()
{
	// Prefer the favicon stored while collecting data; otherwise use the parent's icon
	if ($this->feedIcon) {
		return $this->feedIcon;
	}

	return parent::getIcon();
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Source Web page URL (should provide either HTML or XML content)
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getSourceUrl()
{
	// static:: resolves the constant on the concrete bridge class, so a
	// subclass redefining FEED_SOURCE_URL is honoured
	$sourceUrl = static::FEED_SOURCE_URL;

	return $sourceUrl;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting the feed title from the source page
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionTitle()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_FEED_TITLE;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting the feed favicon from the source page
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionIcon()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_FEED_ICON;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting the feed items from the source page
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItem()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item title from the item context
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItemTitle()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM_TITLE;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item's content from the item context
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItemContent()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM_CONTENT;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item link from the item context
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItemUri()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM_URI;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item author from the item context
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItemAuthor()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM_AUTHOR;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item timestamp from the item context
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItemTimestamp()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM_TIMESTAMP;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting item enclosures (media content like
|
||||||
|
* images or movies) from the item context
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItemEnclosures()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM_ENCLOSURES;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath expression for extracting an item category from the item context
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function getExpressionItemCategories()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$expression = static::XPATH_EXPRESSION_ITEM_CATEGORIES;

	return $expression;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fix encoding
|
||||||
|
* @return bool
|
||||||
|
*/
|
||||||
|
protected function getSettingFixEncoding()
{
	// static:: picks up a redefinition of the constant in the concrete bridge class
	$fixEncoding = static::SETTING_FIX_ENCODING;

	return $fixEncoding;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal helper method for quickly accessing all the user defined constants
|
||||||
|
* in derived classes
|
||||||
|
*
|
||||||
|
* @param $name
|
||||||
|
* @return bool|string
|
||||||
|
*/
|
||||||
|
private function getParam($name)
{
	// Maps each parameter name to the overridable getter that provides its value
	$getters = array(
		'url' => 'getSourceUrl',
		'feed_title' => 'getExpressionTitle',
		'feed_icon' => 'getExpressionIcon',
		'item' => 'getExpressionItem',
		'title' => 'getExpressionItemTitle',
		'content' => 'getExpressionItemContent',
		'uri' => 'getExpressionItemUri',
		'author' => 'getExpressionItemAuthor',
		'timestamp' => 'getExpressionItemTimestamp',
		'enclosures' => 'getExpressionItemEnclosures',
		'categories' => 'getExpressionItemCategories',
		'fix_encoding' => 'getSettingFixEncoding',
	);

	if (!isset($getters[$name])) {
		// Unknown names yield null, as with the implicit return of a switch without default
		return null;
	}

	$getter = $getters[$name];

	return $this->$getter();
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should provide the source website HTML content
|
||||||
|
* can be easily overwritten for example if special headers or auth infos are required
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function provideWebsiteContent()
{
	// collectData() stores the source URL in feedUri before calling this method
	$sourceUrl = $this->feedUri;

	return getContents($sourceUrl);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should provide the feeds title
|
||||||
|
*
|
||||||
|
* @param DOMXPath $xpath
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function provideFeedTitle(DOMXPath $xpath) {
	$expression = $this->getParam('feed_title');

	// A blank expression is allowed (getName() then falls back to the parent's
	// name); querying it would only produce an invalid-expression warning.
	if (null === $expression || '' === $expression) {
		return null;
	}

	$title = $xpath->query($expression);

	// query() returns false for a malformed expression. The length property is
	// used instead of count() because DOMNodeList is only Countable from PHP 7.2.
	if ($title instanceof DOMNodeList && $title->length === 1) {
		return $this->getItemValueOrNodeValue($title);
	}

	// No match or an ambiguous match: no title is provided
	return null;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should provide the URL of the feed's favicon
|
||||||
|
*
|
||||||
|
* @param DOMXPath $xpath
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function provideFeedIcon(DOMXPath $xpath) {
	$expression = $this->getParam('feed_icon');

	// A blank expression is allowed (getIcon() then falls back to the parent's
	// icon); querying it would only produce an invalid-expression warning.
	if (null === $expression || '' === $expression) {
		return null;
	}

	$icon = $xpath->query($expression);

	// query() returns false for a malformed expression. The length property is
	// used instead of count() because DOMNodeList is only Countable from PHP 7.2.
	if ($icon instanceof DOMNodeList && $icon->length === 1) {
		return $this->cleanImageUrl($this->getItemValueOrNodeValue($icon));
	}

	// No match or an ambiguous match: no icon is provided
	return null;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should provide the feed's items.
|
||||||
|
*
|
||||||
|
* @param DOMXPath $xpath
|
||||||
|
* @return DOMNodeList
|
||||||
|
*/
|
||||||
|
protected function provideFeedItems(DOMXPath $xpath)
{
	// Errors are suppressed here; collectData() checks the result for false
	$entries = @$xpath->query($this->getParam('item'));

	return $entries;
}
|
||||||
|
|
||||||
|
/**
 * Loads the source page, detects feed title and icon and converts every node
 * matched by the item expression into a feed item.
 */
public function collectData() {

	$this->feedUri = $this->getParam('url');

	$webPageHtml = new DOMDocument();

	// Parser errors are collected internally and discarded. The previous
	// libxml setting is restored afterwards (even if loading throws) instead
	// of being forced to false.
	$previousErrorSetting = libxml_use_internal_errors(true);
	try {
		$webPageHtml->loadHTML($this->provideWebsiteContent());
	} finally {
		libxml_clear_errors();
		libxml_use_internal_errors($previousErrorSetting);
	}

	$xpath = new DOMXPath($webPageHtml);

	$this->feedName = $this->provideFeedTitle($xpath);
	$this->feedIcon = $this->provideFeedIcon($xpath);

	$entries = $this->provideFeedItems($xpath);
	if(false === $entries || null === $entries) {
		// Invalid item expression: nothing to iterate over
		return;
	}

	$itemParams = array('title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories');

	foreach ($entries as $entry) {
		$item = new \FeedItem();
		foreach($itemParams as $param) {

			$expression = $this->getParam($param);
			if(null === $expression || '' === $expression) {
				// No expression configured for this field
				continue;
			}

			//can be a string or DOMNodeList, depending on the expression result
			$typedResult = @$xpath->evaluate($expression, $entry);

			// The length property is used instead of count() because
			// DOMNodeList is only Countable from PHP 7.2.
			$isEmptyNodeList = $typedResult instanceof DOMNodeList && $typedResult->length === 0;
			$isBlankString = is_string($typedResult) && strlen(trim($typedResult)) === 0;
			if (false === $typedResult || $isEmptyNodeList || $isBlankString) {
				continue;
			}

			$item->__set($param, $this->formatParamValue($param, $this->getItemValueOrNodeValue($typedResult)));

		}

		$itemId = $this->generateItemId($item);
		if(null !== $itemId) {
			$item->setUid($itemId);
		}

		$this->items[] = $item;
	}

}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param $param
|
||||||
|
* @param $value
|
||||||
|
* @return string|array
|
||||||
|
*/
|
||||||
|
protected function formatParamValue($param, $value)
{
	// Encoding is fixed exactly once here, before the field-specific formatting
	$value = $this->fixEncoding($value);

	switch ($param) {
		case 'title':
			return $this->formatItemTitle($value);
		case 'content':
			return $this->formatItemContent($value);
		case 'uri':
			return $this->formatItemUri($value);
		case 'author':
			return $this->formatItemAuthor($value);
		case 'timestamp':
			return $this->formatItemTimestamp($value);
		case 'enclosures':
			return array($this->cleanImageUrl($value));
		case 'categories':
			// $value already went through fixEncoding() above; applying it a
			// second time would decode the text twice
			return array($value);
	}

	// Unknown parameters are passed through with fixed encoding only
	return $value;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Formats the title of a feed item. Takes extracted raw title and returns it formatted
|
||||||
|
* as string.
|
||||||
|
* Can be easily overridden in case the value needs to be transformed into something
|
||||||
|
* else.
|
||||||
|
* @param string $value
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function formatItemTitle($value)
{
	// Pass-through by default; acts as an extension point for subclasses
	$title = $value;

	return $title;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Formats the content of a feed item. Takes extracted raw content and returns it formatted
|
||||||
|
* as string.
|
||||||
|
* Can be easily overwritten for example if a special format has to be expected on the
|
||||||
|
* source website.
|
||||||
|
* @param string $value
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function formatItemContent($value) {
|
||||||
|
return $value;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Hook for formatting the URI of a feed item.
 *
 * An empty value yields an empty string. A value that already starts with
 * 'http://' or 'https://' is returned untouched. Anything else is combined
 * with the feed URI ($this->feedUri) through urljoin().
 * Override this method when the URI needs to be transformed differently.
 *
 * @param string $value Extracted raw URI
 * @return string URI to use for the item
 */
protected function formatItemUri($value) {
    if(0 === strlen($value)) {
        return '';
    }

    // Values carrying an explicit http(s) scheme need no further resolution.
    foreach(array('http://', 'https://') as $scheme) {
        if(0 === strpos($value, $scheme)) {
            return $value;
        }
    }

    return urljoin($this->feedUri, $value);
}
|
||||||
|
|
||||||
|
/**
 * Hook for formatting the author of a feed item.
 *
 * Receives the extracted raw author and returns the author to use. This
 * default implementation passes the value through unchanged; override it
 * when the author needs to be transformed.
 *
 * @param string $value Extracted raw author
 * @return string Author to use for the item
 */
protected function formatItemAuthor($value) {
    $author = $value;

    return $author;
}
|
||||||
|
|
||||||
|
/**
 * Hook for formatting the timestamp of a feed item.
 *
 * Feeds the extracted raw timestamp to strtotime() and returns its result.
 * Override this method when the source website uses a date format that
 * strtotime() does not understand.
 *
 * @param string $value Extracted raw timestamp
 * @return false|int Result of strtotime(): a Unix timestamp, or false on failure
 */
protected function formatItemTimestamp($value) {
    $unixTime = strtotime($value);

    return $unixTime;
}
|
||||||
|
|
||||||
|
/**
 * Hook for formatting the enclosures of a feed item.
 *
 * Runs the extracted raw value through cleanImageUrl() and wraps the result
 * in a single-element array. Override this method when enclosures need to
 * be derived differently.
 *
 * @param string $value Extracted raw enclosure value
 * @return array Single-element array holding the result of cleanImageUrl()
 */
protected function formatItemEnclosures($value) {
    $imageUrl = $this->cleanImageUrl($value);

    return array($imageUrl);
}
|
||||||
|
|
||||||
|
/**
 * Hook for formatting the categories of a feed item.
 *
 * Wraps the extracted raw value in a single-element array. Override this
 * method when the value needs to be split or transformed into several
 * categories.
 *
 * @param string $value Extracted raw category value
 * @return array Single-element array containing $value
 */
protected function formatItemCategories($value) {
    $categories = array();
    $categories[] = $value;

    return $categories;
}
|
||||||
|
|
||||||
|
/**
 * Extracts an image URL from the given string and resolves it against the
 * feed URI.
 *
 * The first substring matching the image pattern (an optional http:/https:
 * prefix, a run of path characters and one of the extensions jpg, gif, png,
 * jpeg or ico) is combined with $this->feedUri through urljoin().
 *
 * @param string $imageUrl Raw text expected to contain an image URL
 * @return string|null Resolved image URL, or null when nothing matched
 */
protected function cleanImageUrl($imageUrl)
{
    $found = array();
    $pattern = '~(?:http(?:s)?:)?[\/a-zA-Z0-9\-_\.]+\.(?:jpg|gif|png|jpeg|ico){1}~';

    if(1 === preg_match($pattern, $imageUrl, $found)) {
        return urljoin($this->feedUri, $found[0]);
    }

    return null;
}
|
||||||
|
|
||||||
|
/**
 * Reduces the result of an XPath evaluation to a trimmed string.
 *
 * Supported inputs:
 * - a DOMNodeList whose first node is an element (its nodeValue is used),
 *   an attribute (its value is used) or a text node (its wholeText is used,
 *   which makes expressions ending in text() work);
 * - a non-empty string.
 *
 * Everything else, including an empty node list and an empty string, is
 * reported through returnServerError().
 *
 * @param DOMNodeList|string $typedResult Result of evaluating an XPath expression
 * @return string Trimmed text of the first matched node or of the string result
 */
protected function getItemValueOrNodeValue($typedResult)
{
    if($typedResult instanceof DOMNodeList) {
        // Only the first matched node is taken into account.
        $item = $typedResult->item(0);
        if($item instanceof DOMElement) {
            return trim($item->nodeValue);
        } elseif($item instanceof DOMAttr) {
            return trim($item->value);
        } elseif($item instanceof DOMText) {
            // Text nodes are what expressions such as './/h2/text()' select.
            return trim($item->wholeText);
        }
    } elseif(is_string($typedResult) && strlen($typedResult) > 0) {
        return trim($typedResult);
    }

    // NOTE(review): returnServerError() is defined elsewhere; presumably it
    // aborts the request, so nothing is returned past this point.
    returnServerError('Unknown type of XPath expression result.');
}
|
||||||
|
|
||||||
|
/**
 * Optionally repairs the encoding of an extracted text.
 *
 * When the 'fix_encoding' parameter (read through getParam()) is truthy the
 * text is run through PHP's utf8_decode(); otherwise it is returned as is.
 * Useful in case of "broken" or "weird" characters in the feed where you
 * would normally expect umlauts.
 *
 * @param string $input Extracted text
 * @return string The decoded text, or $input unchanged when the fix is disabled
 */
protected function fixEncoding($input)
{
    if($this->getParam('fix_encoding')) {
        return utf8_decode($input);
    }

    return $input;
}
|
||||||
|
|
||||||
|
/**
 * Hook for providing a custom unique id for a feed item.
 *
 * This default implementation returns null, which requests automatic
 * generation of the id. Override it and return a string to set the id
 * explicitly.
 *
 * @param FeedItem $item Item the id is requested for
 * @return string|null Custom unique id, or null for automatic generation
 */
protected function generateItemId(\FeedItem $item) {
    // null = no custom id, rely on auto generation
    return null;
}
|
||||||
|
}
|
|
@ -74,6 +74,7 @@ require_once PATH_LIB . 'BridgeList.php';
|
||||||
require_once PATH_LIB . 'ParameterValidator.php';
|
require_once PATH_LIB . 'ParameterValidator.php';
|
||||||
require_once PATH_LIB . 'ActionFactory.php';
|
require_once PATH_LIB . 'ActionFactory.php';
|
||||||
require_once PATH_LIB . 'ActionAbstract.php';
|
require_once PATH_LIB . 'ActionAbstract.php';
|
||||||
|
require_once PATH_LIB . 'XPathAbstract.php';
|
||||||
|
|
||||||
// Functions
|
// Functions
|
||||||
require_once PATH_LIB . 'html.php';
|
require_once PATH_LIB . 'html.php';
|
||||||
|
|
Loading…
Reference in a new issue