diff --git a/bridges/LeMotDuJourBridge.php b/bridges/LeMotDuJourBridge.php deleted file mode 100644 index d1215941..00000000 --- a/bridges/LeMotDuJourBridge.php +++ /dev/null @@ -1,55 +0,0 @@ -maintainer = "qwertygc"; - $this->name = "LeMotDuJour Bridge"; - $this->uri = "http://www.lemotdujour.com/"; - $this->description = "Returns the newest articles."; - $this->update = "2014-05-25"; - - } - - public function collectData(array $param){ - - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } - function ExtractContent($url) { - $html2 = $this->file_get_html($url); - $text = $html2->find('div.single-contenu', 0)->innertext; - return $text; - } - $html = $this->file_get_html('http://feeds2.feedburner.com/lemotdujour/lemotdujour') or $this->returnError('Could not request LeMotDuJour.', 404); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 10) { - $item = new \Item(); - $item->title = StripCDATA($element->find('title', 0)->innertext); - $item->uri = StripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = ExtractContent($item->uri); - $this->items[] = $item; - $limit++; - } - } - - } - - public function getName(){ - return 'LeMotDuJour Bridge'; - } - - public function getURI(){ - return 'http://lemotdujour.com/'; - } - - public function getCacheDuration(){ - return 3600*2; // 2 hours - // return 0; // 2 hours - } -} diff --git a/bridges/RaymondBridge.php b/bridges/RaymondBridge.php deleted file mode 100644 index 9e96a54f..00000000 --- a/bridges/RaymondBridge.php +++ /dev/null @@ -1,53 +0,0 @@ -maintainer = "pit-fgfjiudghdf"; - $this->name = "Raymond"; - $this->uri = "http://www.raymond.cc"; - $this->description = "Returns the 3 newest posts from Raymond.cc (full text)"; - $this->update = "2014-05-26"; - - } - - public function collectData(array $param){ - function raymondStripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } - function raymondExtractContent($url) { - $html2 = $this->file_get_html($url); - $text = $html2->find('div.entry-content', 0)->innertext; - $text = preg_replace('/class="ad".*/', '', $text); - $text = strip_tags($text, '

'); - $text = str_replace('(adsbygoogle = window.adsbygoogle || []).push({});', '', $text); - return $text; - } - $html = $this->file_get_html('http://www.raymond.cc/blog/feed') or $this->returnError('Could not request raymond.', 404); - $limit = 0; - foreach($html->find('item') as $element) { - if($limit < 3) { - $item = new \Item(); - $item->title = raymondStripCDATA($element->find('title', 0)->innertext); - $item->uri = raymondStripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = raymondExtractContent($item->uri); - $this->items[] = $item; - $limit++; - } - } - - } - public function getName(){ - return 'raymond'; - } - public function getURI(){ - return 'http://www.raymond.cc/blog'; - } - public function getCacheDuration(){ - return 3600*12; // 12 hour - } -} - diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 7a1b12e3..35ef3bfd 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -1,7 +1,10 @@ name = "Wordpress Bridge"; $this->uri = "https://wordpress.org/"; $this->description = "Returns the 3 newest full posts of a Wordpress blog"; - $this->update = "2016-08-02"; + $this->update = "2016-08-04"; $this->parameters[] = '[ @@ -19,60 +22,118 @@ class WordPressBridge extends BridgeAbstract { "identifier" : "url" } ]'; + } + // Returns the content type for a given html dom + function DetectContentType($html){ + if($html->find('entry')) + return WORDPRESS_TYPE_ATOM; + if($html->find('item')) + return WORDPRESS_TYPE_RSS; + return WORDPRESS_TYPE_ATOM; // Make ATOM default + } + + // Replaces all 'link' tags with 'url' for simplehtmldom to actually find 'links' ('url') + function ReplaceLinkTagsWithUrlTags($element){ + // We need to fix the 'link' tag as simplehtmldom cannot parse it (just rename it and load back as dom) + $element_text = $element->outertext; + $element_text = str_replace('', '', $element_text); + $element_text = str_replace('', '', $element_text); + $element_text = str_replace('', '', $string); + return $string; + } + + function ClearContent($content) { + $content = preg_replace('/]*>[^<]*<\/script>/', '', $content); + $content = preg_replace('/

/', '', $content); + return $content; } public function collectData(array $param) { - - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } - - function clearContent($content) { - $content = preg_replace('//', '', $content); - $content = preg_replace('/
processParams($param); if (!$this->hasUrl()) { $this->returnError('You must specify a URL', 400); } - $this->url = $this->url.'/feed/atom'; + $this->url = $this->url.'/feed/atom'; $html = $this->file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404); - $posts = $html->find('entry'); - if(!empty($posts) ) { - $this->name = $html->find('title', 0)->plaintext; - $i=0; - foreach ($html->find('entry') as $article) { + + // Notice: We requested an ATOM feed, however some sites return RSS feeds instead! + $type = $this->DetectContentType($html); + + if($type === WORDPRESS_TYPE_RSS) + $posts = $html->find('item'); + else + $posts = $html->find('entry'); + + if(!empty($posts) ) { + $this->sitename = $html->find('title', 0)->plaintext; + $i=0; + + foreach ($posts as $article) { if($i < 3) { - $this->items[$i]->uri = $article->find('link', 0)->getAttribute('href'); - $this->items[$i]->title = StripCDATA($article->find('title', 0)->plaintext); - $this->items[$i]->author = trim($article->find('author', 0)->innertext); - $this->items[$i]->timestamp = strtotime($article->find('updated', 0)->innertext); - $article_html = $this->file_get_html($this->items[$i]->uri); - $this->items[$i]->content = clearContent($article_html->find('article', 0)->innertext); - if(empty($this->items[$i]->content)) - $this->items[$i]->content = clearContent($article_html->find('.single-content', 0)->innertext); // another common content div - if(empty($this->items[$i]->content)) - $this->items[$i]->content = clearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 + $item = new \Item(); + $article = $this->ReplaceLinkTagsWithUrlTags($article); + + if($type === WORDPRESS_TYPE_RSS){ + $item->uri = $article->find('url', 0)->innertext; // 'link' => 'url'! + $item->title = $article->find('title', 0)->plaintext; + $item->author = trim($this->StripCDATA($article->find('dc:creator', 0)->innertext)); + $item->timestamp = strtotime($article->find('pubDate', 0)->innertext); + } else { + $item->uri = $article->find('url', 0)->getAttribute('href'); // 'link' => 'url'! + $item->title = $this->StripCDATA($article->find('title', 0)->plaintext); + $item->author = trim($article->find('author', 0)->innertext); + $item->timestamp = strtotime($article->find('updated', 0)->innertext); + } + + $article_html = $this->file_get_html($item->uri); + + // Attempt to find most common content div + if(empty($item->content)){ + $article = $article_html->find('article', 0); + if(!empty($article)){ + $item->content = $this->ClearContent($article->innertext); + } + } + + // another common content div + if(empty($item->content)){ + $article = $article_html->find('.single-content', 0); + if(!empty($article)){ + $item->content = $this->ClearContent($article->innertext); + } + } + + // for old WordPress themes without HTML5 + if(empty($item->content)){ + $article = $article_html->find('.post', 0); + if(!empty($article)){ + $item->content = $this->ClearContent($article->innertext); + } + } + + $this->items[] = $item; $i++; } } - } - else { + } else { $this->returnError("Sorry, {$this->url} doesn't seem to be a Wordpress blog.", 404); } } public function getName() { - return "{$this->name} - Wordpress Bridge"; + return "{$this->sitename} - Wordpress Bridge"; } public function getURI() { @@ -93,6 +154,4 @@ class WordPressBridge extends BridgeAbstract { private function processParams($param) { $this->url = $param['url']; } - } -