From 85ac9001d6c700658664c0e5cab9f27f80ee93b8 Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Tue, 13 Feb 2018 21:46:33 +0100 Subject: [PATCH] [IPBBridge] Add bridge (#564) This bridge returns feeds for any URI that is compatible with the IPB implementation (currently 4.x). Older versions might work, but there is no guarantee. Only forum and topic URIs are supported! The bridge automatically checks if natural feeds are available (by adding '.xml' to the URI). If so, the feed is returned. Otherwise, the bridge will attempt to identify the content type and build a feed accordingly. Valid URIs are forums and topics. For forums the first page is returned, for topics the last one. Elements are ordered such that the latest entry is returned first (newest-to-oldest). The optional parameter '&limit=' specifies how many pages should be loaded (default: 1). Topics are loaded in reverse order. => The limit does not work with forums! Images are provided as enclosures and scaled to a max-size of 400x400 pixels by default (except for natural feeds). The content is filtered before being returned: - Unnecessary tags are removed (iframes, etc...) 
- Styles for blockquotes are restored (grey background) Closes #507 --- bridges/IPBBridge.php | 307 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 bridges/IPBBridge.php diff --git a/bridges/IPBBridge.php b/bridges/IPBBridge.php new file mode 100644 index 00000000..f3fa14f4 --- /dev/null +++ b/bridges/IPBBridge.php @@ -0,0 +1,307 @@ + array( + 'name' => 'URI', + 'type' => 'text', + 'required' => true, + 'title' => 'Insert forum, subforum or topic URI', + 'exampleValue' => 'https://invisioncommunity.com/forums/forum/499-feedback-and-ideas/' + ), + 'limit' => array( + 'name' => 'Limit', + 'type' => 'number', + 'required' => false, + 'title' => 'Specify how many pages should be fetched (-1: all)', + 'defaultValue' => 1 + ) + ) + ); + const CACHE_TIMEOUT = 3600; + + // Constants for internal use + const FORUM_TYPE_LIST_FILTER = '.cForumTopicTable'; + const FORUM_TYPE_TABLE_FILTER = '#forum_table'; + + const TOPIC_TYPE_ARTICLE = 'article'; + const TOPIC_TYPE_DIV = 'div.post_block'; + + public function getURI(){ + return $this->getInput('uri') ?: parent::getURI(); + } + + public function collectData(){ + // The URI cannot be the mainpage (or anything related) + switch(parse_url($this->getInput('uri'), PHP_URL_PATH)) { + case null: + case '/index.php': + returnClientError('Provided URI is invalid!'); + break; + default: + break; + } + + // Sanitize the URI (because else it won't work) + $uri = rtrim($this->getInput('uri'), '/'); // No trailing slashes! + + // Forums might provide feeds, though that's optional *facepalm* + // Let's check if there is a valid feed available + $headers = get_headers($uri . '.xml'); + + if($headers[0] === 'HTTP/1.1 200 OK') { // Heureka! It's a valid feed! + return $this->collectExpandableDatas($uri); + } + + // No valid feed, so do it the hard way + $html = getSimpleHTMLDOM($uri) + or returnServerError('Could not request ' . $this->getInput('uri') . 
'!'); + + $limit = $this->getInput('limit'); + + // Determine if this is a topic or a forum + switch(true) { + case $this->isTopic($html): + $this->collectTopic($html, $limit); + break; + case $this->isForum($html); + $this->collectForum($html); + break; + default: + returnClientError('Unknown type!'); + break; + } + } + + private function isForum($html){ + return !is_null($html->find('div[data-controller*=forums.front.forum.forumPage]', 0)) + || !is_null($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)); + } + + private function isTopic($html){ + return !is_null($html->find('div[data-controller*=core.front.core.commentFeed]', 0)) + || !is_null($html->find(static::TOPIC_TYPE_DIV, 0)); + } + + private function collectForum($html){ + // There are multiple forum designs in use (depends on version?) + // 1 - Uses an ordered list (based on https://invisioncommunity.com/forums) + // 2 - Uses a table (based on https://onehallyu.com) + + switch(true) { + case !is_null($html->find(static::FORUM_TYPE_LIST_FILTER, 0)): + $this->collectForumList($html); + break; + case !is_null($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)): + $this->collectForumTable($html); + break; + default: + returnClientError('Unknown forum format!'); + break; + } + } + + private function collectForumList($html){ + foreach($html->find(static::FORUM_TYPE_LIST_FILTER, 0)->children() as $row) { + // Columns: Title, Statistics, Last modified + $item = array(); + + $item['uri'] = $row->find('a', 0)->href; + $item['title'] = $row->find('a', 0)->title; + $item['author'] = $row->find('a', 1)->innertext; + $item['timestamp'] = strtotime($row->find('time', 0)->getAttribute('datetime')); + + $this->items[] = $item; + } + } + + private function collectForumTable($html){ + foreach($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)->children() as $row) { + // Columns: Icon, Content, Preview, Statistics, Last modified + $item = array(); + + // Skip header row + if(!is_null($row->find('th', 0))) continue; + + $item['uri'] 
= $row->find('a', 0)->href; + $item['title'] = $row->find('.title', 0)->plaintext; + $item['timestamp'] = strtotime($row->find('[itemprop=dateCreated]', 0)->plaintext); + + $this->items[] = $item; + } + } + + private function collectTopic($html, $limit){ + // There are multiple topic designs in use (depends on version?) + // 1 - Uses articles (based on https://invisioncommunity.com/forums) + // 2 - Uses divs (based on https://onehallyu.com) + + switch(true) { + case !is_null($html->find(static::TOPIC_TYPE_ARTICLE, 0)): + $this->collectTopicHistory($html, $limit, 'collectTopicArticle'); + break; + case !is_null($html->find(static::TOPIC_TYPE_DIV, 0)): + $this->collectTopicHistory($html, $limit, 'collectTopicDiv'); + break; + default: + returnClientError('Unknown topic format!'); + break; + } + } + + private function collectTopicHistory($html, $limit, $callback){ + // Make sure the callback is valid! + if(!method_exists($this, $callback)) + returnServerError('Unknown function (\'' . $callback . '\')!'); + + $next = null; // Holds the URI of the next page + + do { + // Skip loading HTML on first iteration + if(!is_null($next)) { + $html = getSimpleHTMLDOMCached($next); + } + + $next = $this->$callback($html, is_null($next)); + $limit--; + } while(!is_null($next) && $limit <> 0); + } + + private function collectTopicArticle($html, $firstrun = true){ + $title = $html->find('h1.ipsType_pageTitle', 0)->plaintext; + + // Are we on last page? 
+ if($firstrun && !is_null($html->find('.ipsPagination', 0))) { + $last = $html->find('.ipsPagination_last a', 0)->{'data-page'}; + $active = $html->find('.ipsPagination_active a', 0)->{'data-page'}; + + if($active !== $last) { + // Load last page into memory (cached) + $html = getSimpleHTMLDOMCached($html->find('.ipsPagination_last a', 0)->href); + } + } + + foreach(array_reverse($html->find(static::TOPIC_TYPE_ARTICLE)) as $article) { + $item = array(); + + $item['uri'] = $article->find('time', 0)->parent()->href; + $item['author'] = $article->find('aside a', 0)->plaintext; + $item['title'] = $item['author'] . ' - ' . $title; + $item['timestamp'] = strtotime($article->find('time', 0)->getAttribute('datetime')); + + $content = $article->find('[data-role=commentContent]', 0); + $content = $this->scaleImages($content); + $item['content'] = $this->fixContent($content); + $item['enclosures'] = $this->findImages($article->find('[data-role=commentContent]', 0)) ?: null; + + $this->items[] = $item; + } + + // Return whatever page comes next (previous, as we add in inverse order) + // Do we have a previous page? (inactive means no) + if(!is_null($html->find('li[class=ipsPagination_prev ipsPagination_inactive]', 0))) { + return null; // No, or no more + } elseif(!is_null($html->find('li[class=ipsPagination_prev]', 0))) { + return $html->find('.ipsPagination_prev a', 0)->href; + } + + return null; + } + + private function collectTopicDiv($html, $firstrun = true){ + $title = $html->find('h1.ipsType_pagetitle', 0)->plaintext; + + // Are we on last page? 
+ if($firstrun && !is_null($html->find('.pagination', 0))) { + + $active = $html->find('li[class=page active]', 0)->plaintext; + + // There are two ways the 'last' page is displayed: + // - With a distict 'last' button (only if there are enough pages) + // - With a button for each page (use last button) + if(!is_null($html->find('li.last', 0))) { + $last = $html->find('li.last a', 0); + } else { + $last = $html->find('li[class=page] a', -1); + } + + if($active !== $last->plaintext) { + // Load last page into memory (cached) + $html = getSimpleHTMLDOMCached($last->href); + } + } + + foreach(array_reverse($html->find(static::TOPIC_TYPE_DIV)) as $article) { + $item = array(); + + $item['uri'] = $article->find('a[rel=bookmark]', 0)->href; + $item['author'] = $article->find('.author', 0)->plaintext; + $item['title'] = $item['author'] . ' - ' . $title; + $item['timestamp'] = strtotime($article->find('.published', 0)->getAttribute('title')); + + $content = $article->find('[itemprop=commentText]', 0); + $content = $this->scaleImages($content); + $item['content'] = $this->fixContent($content); + + $item['enclosures'] = $this->findImages($article->find('.post_body', 0)) ?: null; + + $this->items[] = $item; + } + + // Return whatever page comes next (previous, as we add in inverse order) + // Do we have a previous page? 
+ if(!is_null($html->find('li.prev', 0))) { + return $html->find('li.prev a', 0)->href; + } + + return null; + } + + /** Returns all images from the provide HTML DOM */ + private function findImages($html){ + $images = array(); + + foreach($html->find('img') as $img) { + $images[] = $img->src; + } + + return $images; + } + + /** Sets the maximum width and height for all images */ + private function scaleImages($html, $width = 400, $height = 400){ + foreach($html->find('img') as $img) { + $img->style = "max-width: {$width}px; max-height: {$height}px;"; + } + + return $html; + } + + /** Removes all unnecessary tags and adds formatting */ + private function fixContent($html){ + + // Restore quote highlighting + foreach($html->find('blockquote') as $quote) { + $quote->style = <<innertext, + '



    • ' + ); + + return $content; + } +}