[IPBBridge] Add bridge (#564)

This bridge returns feeds for any URI that is compatible with the
IPB implementation (currently 4.x). Older versions might work, but
there is no guarantee.

Only forum and topic URIs are supported!

The bridge automatically checks if natural feeds are available (by
adding '.xml' to the URI). If so the feed is returned. Otherwise
the bridge will attempt to identify the content type and build a
feed accordingly.

Valid URIs are forums and topics. For forums the first page is
returned, for topics the last one. Elements are ordered such that
the latest entry is returned first (oldest-to-newest)

The optional parameter '&limit=' specifies how many pages should
be loaded (default: 1). Topics are loaded in reverse order.
=> Does not work with forums!

Images are provided as enclosures and scaled to a max-size of
400x400 pixels by default (Except for natural feeds).

The content is filtered before being returned:
- Unnecessary tags are removed (iframes, etc...)
- Styles for blockquotes are restored (grey background)

Closes #507
This commit is contained in:
LogMANOriginal 2018-02-13 21:46:33 +01:00 committed by GitHub
parent 7939bffcdd
commit 85ac9001d6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

307
bridges/IPBBridge.php Normal file
View file

@ -0,0 +1,307 @@
<?php
class IPBBridge extends FeedExpander {
const NAME = 'IPB Bridge';
const URI = 'https://www.invisionpower.com';
const DESCRIPTION = 'Returns feeds for forums powered by IPB';
const MAINTAINER = 'logmanoriginal';
const PARAMETERS = array(
array(
'uri' => array(
'name' => 'URI',
'type' => 'text',
'required' => true,
'title' => 'Insert forum, subforum or topic URI',
'exampleValue' => 'https://invisioncommunity.com/forums/forum/499-feedback-and-ideas/'
),
'limit' => array(
'name' => 'Limit',
'type' => 'number',
'required' => false,
'title' => 'Specify how many pages should be fetched (-1: all)',
'defaultValue' => 1
)
)
);
const CACHE_TIMEOUT = 3600;
// Constants for internal use
const FORUM_TYPE_LIST_FILTER = '.cForumTopicTable';
const FORUM_TYPE_TABLE_FILTER = '#forum_table';
const TOPIC_TYPE_ARTICLE = 'article';
const TOPIC_TYPE_DIV = 'div.post_block';
public function getURI(){
return $this->getInput('uri') ?: parent::getURI();
}
public function collectData(){
// The URI cannot be the mainpage (or anything related)
switch(parse_url($this->getInput('uri'), PHP_URL_PATH)) {
case null:
case '/index.php':
returnClientError('Provided URI is invalid!');
break;
default:
break;
}
// Sanitize the URI (because else it won't work)
$uri = rtrim($this->getInput('uri'), '/'); // No trailing slashes!
// Forums might provide feeds, though that's optional *facepalm*
// Let's check if there is a valid feed available
$headers = get_headers($uri . '.xml');
if($headers[0] === 'HTTP/1.1 200 OK') { // Heureka! It's a valid feed!
return $this->collectExpandableDatas($uri);
}
// No valid feed, so do it the hard way
$html = getSimpleHTMLDOM($uri)
or returnServerError('Could not request ' . $this->getInput('uri') . '!');
$limit = $this->getInput('limit');
// Determine if this is a topic or a forum
switch(true) {
case $this->isTopic($html):
$this->collectTopic($html, $limit);
break;
case $this->isForum($html);
$this->collectForum($html);
break;
default:
returnClientError('Unknown type!');
break;
}
}
private function isForum($html){
return !is_null($html->find('div[data-controller*=forums.front.forum.forumPage]', 0))
|| !is_null($html->find(static::FORUM_TYPE_TABLE_FILTER, 0));
}
private function isTopic($html){
return !is_null($html->find('div[data-controller*=core.front.core.commentFeed]', 0))
|| !is_null($html->find(static::TOPIC_TYPE_DIV, 0));
}
private function collectForum($html){
// There are multiple forum designs in use (depends on version?)
// 1 - Uses an ordered list (based on https://invisioncommunity.com/forums)
// 2 - Uses a table (based on https://onehallyu.com)
switch(true) {
case !is_null($html->find(static::FORUM_TYPE_LIST_FILTER, 0)):
$this->collectForumList($html);
break;
case !is_null($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)):
$this->collectForumTable($html);
break;
default:
returnClientError('Unknown forum format!');
break;
}
}
private function collectForumList($html){
foreach($html->find(static::FORUM_TYPE_LIST_FILTER, 0)->children() as $row) {
// Columns: Title, Statistics, Last modified
$item = array();
$item['uri'] = $row->find('a', 0)->href;
$item['title'] = $row->find('a', 0)->title;
$item['author'] = $row->find('a', 1)->innertext;
$item['timestamp'] = strtotime($row->find('time', 0)->getAttribute('datetime'));
$this->items[] = $item;
}
}
private function collectForumTable($html){
foreach($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)->children() as $row) {
// Columns: Icon, Content, Preview, Statistics, Last modified
$item = array();
// Skip header row
if(!is_null($row->find('th', 0))) continue;
$item['uri'] = $row->find('a', 0)->href;
$item['title'] = $row->find('.title', 0)->plaintext;
$item['timestamp'] = strtotime($row->find('[itemprop=dateCreated]', 0)->plaintext);
$this->items[] = $item;
}
}
private function collectTopic($html, $limit){
// There are multiple topic designs in use (depends on version?)
// 1 - Uses articles (based on https://invisioncommunity.com/forums)
// 2 - Uses divs (based on https://onehallyu.com)
switch(true) {
case !is_null($html->find(static::TOPIC_TYPE_ARTICLE, 0)):
$this->collectTopicHistory($html, $limit, 'collectTopicArticle');
break;
case !is_null($html->find(static::TOPIC_TYPE_DIV, 0)):
$this->collectTopicHistory($html, $limit, 'collectTopicDiv');
break;
default:
returnClientError('Unknown topic format!');
break;
}
}
private function collectTopicHistory($html, $limit, $callback){
// Make sure the callback is valid!
if(!method_exists($this, $callback))
returnServerError('Unknown function (\'' . $callback . '\')!');
$next = null; // Holds the URI of the next page
do {
// Skip loading HTML on first iteration
if(!is_null($next)) {
$html = getSimpleHTMLDOMCached($next);
}
$next = $this->$callback($html, is_null($next));
$limit--;
} while(!is_null($next) && $limit <> 0);
}
private function collectTopicArticle($html, $firstrun = true){
$title = $html->find('h1.ipsType_pageTitle', 0)->plaintext;
// Are we on last page?
if($firstrun && !is_null($html->find('.ipsPagination', 0))) {
$last = $html->find('.ipsPagination_last a', 0)->{'data-page'};
$active = $html->find('.ipsPagination_active a', 0)->{'data-page'};
if($active !== $last) {
// Load last page into memory (cached)
$html = getSimpleHTMLDOMCached($html->find('.ipsPagination_last a', 0)->href);
}
}
foreach(array_reverse($html->find(static::TOPIC_TYPE_ARTICLE)) as $article) {
$item = array();
$item['uri'] = $article->find('time', 0)->parent()->href;
$item['author'] = $article->find('aside a', 0)->plaintext;
$item['title'] = $item['author'] . ' - ' . $title;
$item['timestamp'] = strtotime($article->find('time', 0)->getAttribute('datetime'));
$content = $article->find('[data-role=commentContent]', 0);
$content = $this->scaleImages($content);
$item['content'] = $this->fixContent($content);
$item['enclosures'] = $this->findImages($article->find('[data-role=commentContent]', 0)) ?: null;
$this->items[] = $item;
}
// Return whatever page comes next (previous, as we add in inverse order)
// Do we have a previous page? (inactive means no)
if(!is_null($html->find('li[class=ipsPagination_prev ipsPagination_inactive]', 0))) {
return null; // No, or no more
} elseif(!is_null($html->find('li[class=ipsPagination_prev]', 0))) {
return $html->find('.ipsPagination_prev a', 0)->href;
}
return null;
}
private function collectTopicDiv($html, $firstrun = true){
$title = $html->find('h1.ipsType_pagetitle', 0)->plaintext;
// Are we on last page?
if($firstrun && !is_null($html->find('.pagination', 0))) {
$active = $html->find('li[class=page active]', 0)->plaintext;
// There are two ways the 'last' page is displayed:
// - With a distict 'last' button (only if there are enough pages)
// - With a button for each page (use last button)
if(!is_null($html->find('li.last', 0))) {
$last = $html->find('li.last a', 0);
} else {
$last = $html->find('li[class=page] a', -1);
}
if($active !== $last->plaintext) {
// Load last page into memory (cached)
$html = getSimpleHTMLDOMCached($last->href);
}
}
foreach(array_reverse($html->find(static::TOPIC_TYPE_DIV)) as $article) {
$item = array();
$item['uri'] = $article->find('a[rel=bookmark]', 0)->href;
$item['author'] = $article->find('.author', 0)->plaintext;
$item['title'] = $item['author'] . ' - ' . $title;
$item['timestamp'] = strtotime($article->find('.published', 0)->getAttribute('title'));
$content = $article->find('[itemprop=commentText]', 0);
$content = $this->scaleImages($content);
$item['content'] = $this->fixContent($content);
$item['enclosures'] = $this->findImages($article->find('.post_body', 0)) ?: null;
$this->items[] = $item;
}
// Return whatever page comes next (previous, as we add in inverse order)
// Do we have a previous page?
if(!is_null($html->find('li.prev', 0))) {
return $html->find('li.prev a', 0)->href;
}
return null;
}
/** Returns all images from the provide HTML DOM */
private function findImages($html){
$images = array();
foreach($html->find('img') as $img) {
$images[] = $img->src;
}
return $images;
}
/** Sets the maximum width and height for all images */
private function scaleImages($html, $width = 400, $height = 400){
foreach($html->find('img') as $img) {
$img->style = "max-width: {$width}px; max-height: {$height}px;";
}
return $html;
}
/** Removes all unnecessary tags and adds formatting */
private function fixContent($html){
// Restore quote highlighting
foreach($html->find('blockquote') as $quote) {
$quote->style = <<<EOD
padding: 0px 15px;
border-width: 1px 1px 1px 2px;
border-style: solid;
border-color: #ededed #e8e8e8 #dbdbdb #666666;
background: #fbfbfb;
EOD;
}
// Remove unnecessary tags
$content = strip_tags(
$html->innertext,
'<p><a><img><ol><ul><li><table><tr><th><td><strong><blockquote><br><hr><h>'
);
return $content;
}
}