/**
 * Bridge that turns a single MoinMoin wiki page into a feed.
 *
 * The page is split into sections at a configurable separator element
 * (header, list element or anchor); every section becomes one feed item.
 * Optionally the anchor of each section is followed and the linked page is
 * used as item content.
 *
 * NOTE(review): this file was recovered from a flattened diff in which every
 * "<...>" span had been stripped. The opening PHP tag, the class header and
 * the MAINTAINER / NAME / URI / DESCRIPTION constants were not visible and
 * are reconstructed from usage (self::NAME, parent::getName(), getInput(),
 * $this->items) and RSS-Bridge conventions - confirm against the original
 * commit. The opening PHP tag must be present at the top of the real file.
 */
class MoinMoinBridge extends BridgeAbstract {

    // NOTE(review): reconstructed values - confirm.
    const MAINTAINER = 'logmanoriginal';
    const NAME = 'MoinMoin Bridge';
    const URI = 'https://moinmo.in';
    const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';

    const PARAMETERS = array(
        array(
            // Key reconstructed from the getInput('source') calls below.
            'source' => array(
                'name' => 'Source',
                'type' => 'text',
                'required' => true,
                'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
                'exampleValue' => 'https://moinmo.in/MoinMoin'
            ),
            'separator' => array(
                'name' => 'Separator',
                'type' => 'list',
                'required' => true,
                'title' => 'Defines the separator for splitting content into feeds',
                'defaultValue' => 'h2',
                'values' => array(
                    'Header (h1)' => 'h1',
                    'Header (h2)' => 'h2',
                    'Header (h3)' => 'h3',
                    'List element (li)' => 'li',
                    'Anchor (a)' => 'a'
                )
            ),
            'limit' => array(
                'name' => 'Limit',
                'type' => 'number',
                'required' => false,
                'title' => 'Number of items to return (from top)',
                'defaultValue' => -1
            ),
            'content' => array(
                'name' => 'Content',
                'type' => 'list',
                'required' => false,
                'title' => 'Defines how feed contents are built',
                'defaultValue' => 'separator',
                'values' => array(
                    'By separator' => 'separator',
                    'Follow link (only for anchor)' => 'follow',
                    'None' => 'none'
                )
            )
        )
    );

    /** Feed title, built from the page title once the page has been loaded. */
    private $title = '';

    /**
     * Loads the configured wiki page, splits it into sections and adds one
     * feed item per section to $this->items.
     *
     * MoinMoin does not mark up its content semantically: every line carries
     * its own ID and the only reliable delimiters are headers (and even
     * those are not present on every page). That is why the page is split
     * with a regular expression on the chosen separator element instead of
     * walking the DOM.
     */
    public function collectData(){
        $source = $this->getInput('source');

        $html = getSimpleHTMLDOM($source)
            or returnServerError('Could not load ' . $source);

        // Some anchors link to local sites or local IDs (both don't work
        // well in feeds)
        $html = $this->fixAnchors($html);

        // Pages without a title element keep the default bridge name.
        $titleElement = $html->find('title', 0);
        if(!is_null($titleElement)){
            $this->title = $titleElement->innertext . ' | ' . self::NAME;
        }

        // Author and timestamp of the source page. They are replaced per
        // item when the section anchor is followed successfully.
        $pageAuthor = $this->findAuthor($html);
        $pageTimestamp = $this->findTimestamp($html);

        $contentMode = $this->getInput('content');
        $limit = (int)$this->getInput('limit');

        // Following links is only meaningful when splitting by anchors;
        // for every other separator 'follow' behaves like 'separator'.
        $followLinks = $contentMode === 'follow'
            && $this->getInput('separator') === 'a';

        foreach($this->splitSections($html) as $section){
            $itemTitle = strip_tags($section[1]);

            // Skip items with empty title (checked first so no linked page
            // is fetched for an item that is dropped anyway)
            if(trim($itemTitle) === ''){
                continue;
            }

            $item = array();
            $item['uri'] = $this->findSectionAnchor($section[0]);
            $item['title'] = $itemTitle;

            $author = $pageAuthor;
            $timestamp = $pageTimestamp;

            if($contentMode !== 'none'){
                $linkedPage = $followLinks
                    ? $this->followAnchor($item['uri'])
                    : null;
                $linkedContent = is_null($linkedPage)
                    ? null
                    : $linkedPage->find('div#page', 0);

                if(!is_null($linkedContent)){
                    // Return only actual content
                    $item['content'] = $linkedContent->innertext;

                    // Each page could have its own author and timestamp
                    $author = $this->findAuthor($linkedPage);
                    $timestamp = $this->findTimestamp($linkedPage);
                } else {
                    // Not following, or the linked page could not be used:
                    // use contents from the current page
                    $item['content'] = $this->cleanArticle($section[2]);
                }
            }

            if(!is_null($author)) $item['author'] = $author;
            if(!is_null($timestamp)) $item['timestamp'] = $timestamp;

            $this->items[] = $item;

            if($limit > 0 && count($this->items) >= $limit){
                break;
            }
        }
    }

    /**
     * Returns the feed name: the page title once collected, otherwise the
     * name provided by the parent class.
     */
    public function getName(){
        return $this->title ?: parent::getName();
    }

    /**
     * Returns the configured source page, or the parent URI when no source
     * has been provided.
     */
    public function getURI(){
        return $this->getInput('source') ?: parent::getURI();
    }

    /**
     * Splits the html into sections.
     *
     * Returns an array with one element per section. Each element consists of:
     * [0] The entire section
     * [1] The section title
     * [2] The section content
     *
     * If the separator does not occur at all, the entire page is returned
     * as a single section titled with the page title.
     */
    private function splitSections($html){
        $page = $html->find('div#page', 0);

        if(is_null($page) || !$page->innertext){
            // NOTE(review): message text reconstructed - confirm.
            returnServerError('Unable to find <div id="page"/>!');
            return array(); // only reached if returnServerError() returns
        }

        $content = $page->innertext;
        $separator = preg_quote($this->getInput('separator'), '/');

        // opening tag, title, closing tag, then everything up to the next
        // separator or the page footer.
        // NOTE(review): the footer look-ahead and the preg_match_all() call
        // were stripped from the recovered source and are reconstructed -
        // confirm against the original.
        $regex = implode(
            '',
            array(
                '\<' . $separator . '.+?(?=\>)\>',
                '(.+?)(?=\<\/' . $separator . '\>)',
                '\<\/' . $separator . '\>',
                '(.+?)((?=\<' . $separator . ')|(?=\<div\sid="pagebottom")){1}'
            )
        );

        $sections = array();

        preg_match_all(
            '/' . $regex . '/m',
            $content,
            $sections,
            PREG_SET_ORDER
        );

        // Some pages don't use the separator, return the page as one feed
        if(count($sections) === 0){
            $titleElement = $html->find('title', 0);

            return array(
                array(
                    $content,
                    is_null($titleElement) ? '' : $titleElement->innertext,
                    $content
                )
            );
        }

        return $sections;
    }

    /**
     * Returns the anchor for a given section.
     *
     * Prefers the ID of the separator element (source URI + fragment), then
     * its href, and falls back to the source URI itself.
     */
    private function findSectionAnchor($section){
        $source = $this->getInput('source');
        $separator = $this->getInput('separator');

        $html = str_get_html($section);
        if(!$html){ // empty or unparsable section
            return $source;
        }

        // For IDs
        $anchor = $html->find($separator . '[id=]', 0);
        if(!is_null($anchor)){
            return $source . '#' . $anchor->id;
        }

        // For actual anchors
        $anchor = $html->find($separator . '[href=]', 0);
        if(!is_null($anchor)){
            return $anchor->href;
        }

        // Nothing found
        return $source;
    }

    /**
     * Returns the author, or null if the page provides none.
     *
     * The author is taken from the title attribute of the first titled
     * element inside the element with id "pageinfo"; anything from an '@'
     * onwards is dropped.
     */
    private function findAuthor($html){
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if(is_null($pageinfo)){
            return null;
        }

        $author = $pageinfo->find('[title=]', 0);
        if(is_null($author)){
            return null;
        }

        return trim(explode('@', $author->title)[0]);
    }

    /**
     * Returns the time of last edit as Unix timestamp, or null if the page
     * does not provide it or it cannot be parsed.
     *
     * The page info text looks like
     * "... (last edited 2017-02-16 15:36:31 by hosted-by)".
     */
    private function findTimestamp($html){
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if(is_null($pageinfo)){
            return null;
        }

        // First run of digits, dashes, colons and whitespace after the
        // opening parenthesis
        $matches = array();
        $found = preg_match(
            '/.+?(?=\().+?(?=\d)([0-9\-\s\:]+)/m',
            $pageinfo->innertext,
            $matches
        );
        if($found !== 1){
            return null;
        }

        $timestamp = strtotime(trim($matches[1]));

        return $timestamp === false ? null : $timestamp;
    }

    /**
     * Returns the original HTML with all anchors fixed (makes relative
     * anchors absolute).
     *
     * $source is the URI the document was loaded from; it defaults to the
     * URI of this bridge.
     */
    private function fixAnchors($html, $source = null){
        $source = $source ?: $this->getURI();
        $domain = $this->findDomain($source);

        foreach($html->find('a') as $anchor){
            if(!isset($anchor->href) || !is_string($anchor->href)){
                continue; // anchor without target
            }

            $href = $anchor->href;

            // Already absolute (http:, https:, mailto:, ...) or
            // protocol-relative, no actions required
            if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1
            || substr($href, 0, 2) === '//'){
                continue;
            }

            if(substr($href, 0, 1) === '/'){ // path relative to the domain
                if(!is_null($domain)){
                    $anchor->href = $domain . $href;
                }
                continue;
            }

            // It's an ID or something like ? or &
            $anchor->href = $source . $href;
        }

        return $html;
    }

    /**
     * Loads the full article of a given anchor (if the anchor is from the
     * same wiki domain). Returns null if the anchor points elsewhere or the
     * page cannot be loaded.
     */
    private function followAnchor($anchor){
        $wikiDomain = $this->findDomain($this->getInput('source'));
        $anchorDomain = $this->findDomain($anchor);

        if(is_null($wikiDomain)
        || is_null($anchorDomain)
        || strcasecmp($wikiDomain, $anchorDomain) !== 0){
            return null;
        }

        $html = getSimpleHTMLDOMCached($anchor);
        if(!$html){ // Cannot load article
            return null;
        }

        return $this->fixAnchors($html, $anchor);
    }

    /**
     * Finds the domain (scheme and authority, no trailing slash) for a given
     * http(s) URI. Returns null if the URI has no such prefix.
     */
    private function findDomain($uri){
        $matches = array();

        // The authority ends at the first '/', '?' or '#' - or at the end of
        // the URI, so "https://moinmo.in" is accepted as well.
        if(preg_match('/^(https?:\/\/[^\/?#]+)/i', (string)$uri, $matches) !== 1){
            return null;
        }

        return $matches[1];
    }

    /**
     * Removes every part of $string that starts with $start and ends with
     * $end (both inclusive). A part that is opened but never closed is
     * removed up to the end of the string.
     *
     * This function is based on CNETBridge.
     */
    private function stripWithDelimiters($string, $start, $end){
        $string = (string)$string;

        if($start === '' || $end === ''){
            return $string;
        }

        while(($from = strpos($string, $start)) !== false){
            $to = strpos($string, $end, $from + strlen($start));

            if($to === false){ // unterminated part, drop the remainder
                return (string)substr($string, 0, $from);
            }

            $string = substr($string, 0, $from)
                . substr($string, $to + strlen($end));
        }

        return $string;
    }

    /**
     * Removes script elements from the given article HTML.
     *
     * This function is based on CNETBridge.
     *
     * NOTE(review): the delimiter arguments were stripped from the recovered
     * source; '<script' / '</script>' are reconstructed - confirm.
     */
    private function cleanArticle($article_html){
        return $this->stripWithDelimiters($article_html, '<script', '</script>');
    }
}