[XenForoBridge] Add new bridge
Adds a bridge for forums powered by XenForo (see https://xenforo.com). Support between forums may vary due to ever changing versions with no clear distinction. Especially timestamps may not work depending on the supported language (should currently work on en-US and de-DE). Tested on - https://xenforo.com/community/ - http://www.ign.com/boards/ Notice: XenForo provides RSS feeds for forums (but not specific topics). For example: https://xenforo.com/community/forums/-/index.rss
This commit is contained in:
parent
5fe943562a
commit
65da157fff
1 changed files with 464 additions and 0 deletions
464
bridges/XenForoBridge.php
Normal file
464
bridges/XenForoBridge.php
Normal file
|
@ -0,0 +1,464 @@
|
|||
<?php
|
||||
/**
|
||||
* This bridge generates feeds for threads from forums running XenForo version 2
|
||||
*
|
||||
* Examples:
|
||||
* - https://xenforo.com/community/
|
||||
* - http://www.ign.com/boards/
|
||||
*
|
||||
* Notice: XenForo does provide RSS feeds for forums. For example:
|
||||
* - https://xenforo.com/community/forums/-/index.rss
|
||||
*
|
||||
* For more information on XenForo, visit
|
||||
* - https://xenforo.com/
|
||||
* - https://en.wikipedia.org/wiki/XenForo
|
||||
*/
|
||||
class XenForoBridge extends BridgeAbstract {
|
||||
|
||||
// Bridge specific constants
|
||||
const CONTEXT_THREAD = 'Thread';
|
||||
const XENFORO_VERSION_1 = '1.0';
|
||||
const XENFORO_VERSION_2 = '2.0';
|
||||
|
||||
// RSS-Bridge constants
|
||||
const NAME = 'XenForo Bridge';
|
||||
const URI = 'https://xenforo.com/';
|
||||
const DESCRIPTION = 'Generates feeds for threads in forums powered by XenForo';
|
||||
const MAINTAINER = 'logmanoriginal';
|
||||
const PARAMETERS = array(
|
||||
self::CONTEXT_THREAD => array(
|
||||
'url' => array(
|
||||
'name' => 'Thread URL',
|
||||
'type' => 'text',
|
||||
'required' => true,
|
||||
'title' => 'Insert URL to the thread for which the feed should be generated',
|
||||
'exampleValue' => 'https://xenforo.com/community/threads/guide-to-suggestions.2285/'
|
||||
)
|
||||
),
|
||||
'global' => array(
|
||||
'limit' => array(
|
||||
'name' => 'Limit',
|
||||
'type' => 'number',
|
||||
'required' => false,
|
||||
'title' => 'Specify maximum number of elements to return in the feed',
|
||||
'defaultValue' => 10
|
||||
)
|
||||
)
|
||||
);
|
||||
const CACHE_TIMEOUT = 7200; // 10 minutes
|
||||
|
||||
private $title = '';
|
||||
private $threadurl = '';
|
||||
private $version; // Holds the XenForo version
|
||||
|
||||
public function getName() {
|
||||
|
||||
switch($this->queriedContext) {
|
||||
case self::CONTEXT_THREAD: return $this->title . ' - ' . static::NAME;
|
||||
}
|
||||
|
||||
return parent::getName();
|
||||
|
||||
}
|
||||
|
||||
public function getURI() {
|
||||
|
||||
switch($this->queriedContext) {
|
||||
case self::CONTEXT_THREAD: return $this->threadurl;
|
||||
}
|
||||
|
||||
return parent::getURI();
|
||||
|
||||
}
|
||||
|
||||
public function collectData() {
|
||||
|
||||
$this->threadurl = filter_var(
|
||||
$this->getInput('url'),
|
||||
FILTER_VALIDATE_URL,
|
||||
FILTER_FLAG_SCHEME_REQUIRED |
|
||||
FILTER_FLAG_HOST_REQUIRED |
|
||||
FILTER_FLAG_PATH_REQUIRED);
|
||||
|
||||
if($this->threadurl === false) {
|
||||
returnClientError('The URL you provided is invalid!');
|
||||
}
|
||||
|
||||
$urlparts = parse_url($this->threadurl, PHP_URL_SCHEME);
|
||||
|
||||
// Scheme must be "http" or "https"
|
||||
if(preg_match('/http[s]{0,1}/', parse_url($this->threadurl, PHP_URL_SCHEME)) == false) {
|
||||
returnClientError('The URL you provided doesn\'t specify a valid scheme (http or https)!');
|
||||
}
|
||||
|
||||
// Path cannot be root (../)
|
||||
if(parse_url($this->threadurl, PHP_URL_PATH) === '/') {
|
||||
returnClientError('The URL you provided doesn\'t link to a valid thread (root path)!');
|
||||
}
|
||||
|
||||
// XenForo adds a thread ID to the URL, like "...-thread.454934283". It must be present
|
||||
if(preg_match('/.+\.\d+[\/]{0,1}/', parse_URL($this->threadurl, PHP_URL_PATH)) == false) {
|
||||
returnClientError('The URL you provided doesn\'t link to a valid thread (ID missing)!');
|
||||
}
|
||||
|
||||
// We want to start at the first page in the thread. XenForo uses "../page-n" syntax
|
||||
// to identify pages (except for the first page).
|
||||
// Notice: XenForo uses the concept of "sentinels" to find and replace parts in the
|
||||
// URL. Technically forum hosts can change the syntax!
|
||||
if(preg_match('/.+\/(page-\d+.*)$/', $this->threadurl, $matches) != false) {
|
||||
|
||||
// before: https://xenforo.com/community/threads/guide-to-suggestions.2285/page-5
|
||||
// after : https://xenforo.com/community/threads/guide-to-suggestions.2285/
|
||||
$this->threadurl = str_replace($matches[1], '', $this->threadurl);
|
||||
|
||||
}
|
||||
|
||||
$html = getSimpleHTMLDOMCached($this->threadurl)
|
||||
or returnServerError('Failed loading data from "' . $this->threadurl . '"!');
|
||||
|
||||
$html = defaultLinkTo($html, $this->threadurl);
|
||||
|
||||
// Notice: The DOM structure changes depending on the XenForo version used
|
||||
if($mainContent = $html->find('div.mainContent', 0)) {
|
||||
$this->version = self::XENFORO_VERSION_1;
|
||||
} elseif ($mainContent = $html->find('div[class="p-body"]', 0)) {
|
||||
$this->version = self::XENFORO_VERSION_2;
|
||||
} else {
|
||||
returnServerError('This forum is currently not supported!');
|
||||
}
|
||||
|
||||
switch($this->version) {
|
||||
case self::XENFORO_VERSION_1:
|
||||
|
||||
$titleBar = $mainContent->find('div.titleBar h1', 0)
|
||||
or returnServerError('Error finding title bar!');
|
||||
|
||||
$this->title = $titleBar->plaintext;
|
||||
|
||||
// Store items from current page (we'll use $this->items as LIFO buffer)
|
||||
$this->extractThreadPostsV1($html, $this->threadurl);
|
||||
$this->extractPagesV1($html);
|
||||
|
||||
break;
|
||||
|
||||
case self::XENFORO_VERSION_2:
|
||||
|
||||
$titleBar = $mainContent->find('div[class="p-title"] h1', 0)
|
||||
or returnServerError('Error finding title bar!');
|
||||
|
||||
$this->title = $titleBar->plaintext;
|
||||
$this->extractThreadPostsV2($html, $this->threadurl);
|
||||
$this->extractPagesV2($html);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
while(count($this->items) > $this->getInput('limit')) {
|
||||
array_shift($this->items);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts thread posts
|
||||
* @param $html A simplehtmldom object
|
||||
* @param $url The url from which $html was loaded
|
||||
*/
|
||||
private function extractThreadPostsV1($html, $url) {
|
||||
|
||||
$lang = $html->find('html', 0)->lang;
|
||||
|
||||
// Posts are contained in an "ol"
|
||||
$messageList = $html->find('#messageList li')
|
||||
or returnServerError('Error finding message list!');
|
||||
|
||||
foreach($messageList as $post) {
|
||||
|
||||
if(!isset($post->attr['id'])) { // Skip ads
|
||||
continue;
|
||||
}
|
||||
|
||||
$item = array();
|
||||
|
||||
$item['uri'] = $url . '#' . $post->getAttribute('id');
|
||||
|
||||
$content = $post->find('.messageContent article', 0);
|
||||
|
||||
// Add some style to quotes
|
||||
foreach($content->find('.bbCodeQuote') as $quote) {
|
||||
$quote->style = '
|
||||
color: #495566;
|
||||
background-color: rgb(248,251,253);
|
||||
border: 1px solid rgb(111, 140, 180);
|
||||
border-color: rgb(111, 140, 180);
|
||||
font-style: italic;';
|
||||
}
|
||||
|
||||
// Remove script tags
|
||||
foreach($content->find('script') as $script) {
|
||||
$script->outertext = '';
|
||||
}
|
||||
|
||||
$item['content'] = $content->innertext;
|
||||
|
||||
// Remove quotes (for the title)
|
||||
foreach($content->find('.bbCodeQuote') as $quote) {
|
||||
$quote->innertext = '';
|
||||
}
|
||||
|
||||
$title = trim($content->plaintext);
|
||||
|
||||
if(strlen($title) > 70) {
|
||||
$item['title'] = substr($title, 0, strpos($title, ' ', 70)) . '...';
|
||||
} else {
|
||||
$item['title'] = $title;
|
||||
}
|
||||
|
||||
/**
|
||||
* Timestamps are presented in two forms:
|
||||
*
|
||||
* 1) short version (for older posts?)
|
||||
* <span
|
||||
* class="DateTime"
|
||||
* title="22 Oct. 2018 at 23:47"
|
||||
* >22 Oct. 2018</span>
|
||||
*
|
||||
* This form has to be interpreted depending on the current language.
|
||||
*
|
||||
* 2) long version (for newer posts?)
|
||||
* <abbr
|
||||
* class="DateTime"
|
||||
* data-time="1541008785"
|
||||
* data-diff="310694"
|
||||
* data-datestring="31 Oct. 2018"
|
||||
* data-timestring="18:59"
|
||||
* title="31 Oct. 2018 at 18:59"
|
||||
* >Wednesday at 18:59</abbr>
|
||||
*
|
||||
* This form has the timestamp embedded (data-time)
|
||||
*/
|
||||
if($timestamp = $post->find('abbr.DateTime', 0)) { // long version (preffered)
|
||||
$item['timestamp'] = $timestamp->{'data-time'};
|
||||
} elseif($timestamp = $post->find('span.DateTime', 0)) { // short version
|
||||
$item['timestamp'] = $this->fixDate($timestamp->title, $lang);
|
||||
}
|
||||
|
||||
$item['author'] = $post->getAttribute('data-author');
|
||||
|
||||
// Bridge specific properties
|
||||
$item['id'] = $post->getAttribute('id');
|
||||
|
||||
$this->items[] = $item;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private function extractThreadPostsV2($html, $url) {
|
||||
|
||||
$lang = $html->find('html', 0)->lang;
|
||||
|
||||
$messageList = $html->find('div[class="block-body"] article')
|
||||
or returnServerError('Error finding message list!');
|
||||
|
||||
foreach($messageList as $post) {
|
||||
|
||||
if(!isset($post->attr['id'])) { // Skip ads
|
||||
continue;
|
||||
}
|
||||
|
||||
$item = array();
|
||||
|
||||
$item['uri'] = $url . '#' . $post->getAttribute('id');
|
||||
|
||||
$title = $post->find('div[class="message-content"] article', 0)->plaintext;
|
||||
$end = strpos($title, ' ', 70);
|
||||
$item['title'] = substr($title, 0, $end);
|
||||
|
||||
$item['timestamp'] = $this->fixDate($post->find('time', 0)->title, $lang);
|
||||
$item['author'] = $post->getAttribute('data-author');
|
||||
$item['content'] = $post->find('div[class="message-content"] article', 0);
|
||||
|
||||
// Bridge specific properties
|
||||
$item['id'] = $post->getAttribute('id');
|
||||
|
||||
$this->items[] = $item;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private function extractPagesV1($html) {
|
||||
|
||||
// A navigation bar becomes available if the number of posts grows too
|
||||
// high. When this happens we need to load further pages (from last backwards)
|
||||
if(($pageNav = $html->find('div.PageNav', 0)) !== false) {
|
||||
|
||||
$lastpage = $pageNav->{'data-last'};
|
||||
$baseurl = $pageNav->{'data-baseurl'};
|
||||
$sentinel = $pageNav->{'data-sentinel'};
|
||||
|
||||
$hosturl = parse_url($this->threadurl, PHP_URL_SCHEME)
|
||||
. '://'
|
||||
. parse_url($this->threadurl, PHP_URL_HOST)
|
||||
. '/';
|
||||
|
||||
$page = $lastpage;
|
||||
|
||||
// Load at least the last page
|
||||
do {
|
||||
|
||||
$pageurl = $hosturl . str_replace($sentinel, $lastpage, $baseurl);
|
||||
|
||||
// We can optimize performance by caching all but the last page
|
||||
if($page != $lastpage) {
|
||||
$html = getSimpleHTMLDOMCached($pageurl)
|
||||
or returnServerError('Error loading contents from ' . $pageurl . '!');
|
||||
} else {
|
||||
$html = getSimpleHTMLDOM($pageurl)
|
||||
or returnServerError('Error loading contents from ' . $pageurl . '!');
|
||||
}
|
||||
|
||||
$html = defaultLinkTo($html, $hosturl);
|
||||
|
||||
$this->extractThreadPostsV1($html, $pageurl);
|
||||
|
||||
$page--;
|
||||
|
||||
} while (count($this->items) < $this->getInput('limit') && $page != 1);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private function extractPagesV2($html) {
|
||||
|
||||
// A navigation bar becomes available if the number of posts grows too
|
||||
// high. When this happens we need to load further pages (from last backwards)
|
||||
if(($pageNav = $html->find('div.pageNav', 0)) !== false) {
|
||||
|
||||
foreach($pageNav->find('li') as $nav) {
|
||||
$lastpage = $nav->plaintext;
|
||||
}
|
||||
|
||||
// Manually extract baseurl and inject sentinel
|
||||
$baseurl = $pageNav->find('li a', -1)->href;
|
||||
$baseurl = str_replace('page-' . $lastpage, 'page-{{sentinel}}', $baseurl);
|
||||
|
||||
$sentinel = '{{sentinel}}';
|
||||
|
||||
$hosturl = parse_url($this->threadurl, PHP_URL_SCHEME)
|
||||
. '://'
|
||||
. parse_url($this->threadurl, PHP_URL_HOST);
|
||||
|
||||
$page = $lastpage;
|
||||
|
||||
// Load at least the last page
|
||||
do {
|
||||
|
||||
$pageurl = $hosturl . str_replace($sentinel, $lastpage, $baseurl);
|
||||
|
||||
// We can optimize performance by caching all but the last page
|
||||
if($page != $lastpage) {
|
||||
$html = getSimpleHTMLDOMCached($pageurl)
|
||||
or returnServerError('Error loading contents from ' . $pageurl . '!');
|
||||
} else {
|
||||
$html = getSimpleHTMLDOM($pageurl)
|
||||
or returnServerError('Error loading contents from ' . $pageurl . '!');
|
||||
}
|
||||
|
||||
$html = defaultLinkTo($html, $this->hosturl);
|
||||
|
||||
$this->extractThreadPostsV2($html, $this->pageurl);
|
||||
|
||||
$page--;
|
||||
|
||||
} while (count($this->items) < $this->getInput('limit') && $page != 1);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Fixes dates depending on the choosen language:
|
||||
*
|
||||
* de : dd.mm.yy
|
||||
* en : dd.mm.yy
|
||||
* it : dd/mm/yy
|
||||
*
|
||||
* Basically strtotime doesn't convert dates correctly due to formats
|
||||
* being hard to interpret. So we use the DateTime object.
|
||||
*
|
||||
* We don't know the timezone, so just assume +00:00 (or whatever
|
||||
* DateTime chooses)
|
||||
*/
|
||||
private function fixDate($date, $lang = 'en-US') {
|
||||
|
||||
$mnamesen = [
|
||||
'January',
|
||||
'Feburary',
|
||||
'March',
|
||||
'April',
|
||||
'May',
|
||||
'June',
|
||||
'July',
|
||||
'August',
|
||||
'September',
|
||||
'October',
|
||||
'November',
|
||||
'December'
|
||||
];
|
||||
|
||||
switch($lang) {
|
||||
case 'en-US': // example: Jun 9, 2018 at 11:46 PM
|
||||
|
||||
$df = date_create_from_format('M d, Y \a\t H:i A', $date);
|
||||
break;
|
||||
|
||||
case 'de-DE': // example: 19 Juli 2018 um 19:27 Uhr
|
||||
|
||||
$mnamesde = [
|
||||
'Januar',
|
||||
'Februar',
|
||||
'März',
|
||||
'April',
|
||||
'Mai',
|
||||
'Juni',
|
||||
'Juli',
|
||||
'August',
|
||||
'September',
|
||||
'Oktober',
|
||||
'November',
|
||||
'Dezember'
|
||||
];
|
||||
|
||||
$mnamesdeshort = [
|
||||
'Jan.',
|
||||
'Feb.',
|
||||
'Mär.',
|
||||
'Apr.',
|
||||
'Mai',
|
||||
'Juni',
|
||||
'Juli',
|
||||
'Aug.',
|
||||
'Sep.',
|
||||
'Okt.',
|
||||
'Nov.',
|
||||
'Dez.'
|
||||
];
|
||||
|
||||
$date = str_ireplace($mnamesde, $mnamesen, $date);
|
||||
$date = str_ireplace($mnamesdeshort, $mnamesen, $date);
|
||||
|
||||
$df = date_create_from_format('d M Y \u\m H:i \U\h\r', $date);
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
// debugMessage(date_format($df, 'U'));
|
||||
|
||||
return date_format($df, 'U');
|
||||
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in a new issue