From 0a008c4eb3c9e415fd55cc174ec5319a66265180 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 20:06:53 +0200 Subject: [PATCH 01/10] [WordPress] Support atom AND rss for feed replies Some sites return RSS feeds instead of ATOM feeds even though ATOM feeds were specifically requested (/feed/atom)! This bridge will now detect ATOM/RSS feeds and change behavior accordingly. --- bridges/WordPressBridge.php | 52 ++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 7a1b12e3..27c231d5 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -1,4 +1,6 @@ name = "Wordpress Bridge"; $this->uri = "https://wordpress.org/"; $this->description = "Returns the 3 newest full posts of a Wordpress blog"; - $this->update = "2016-08-02"; + $this->update = "2016-08-04"; $this->parameters[] = '[ @@ -22,6 +24,24 @@ class WordPressBridge extends BridgeAbstract { } + // Returns the content type for a given html dom + function DetectContentType($html){ + if($html->find('entry')) + return WORDPRESS_TYPE_ATOM; + if($html->find('item')) + return WORDPRESS_TYPE_RSS; + return WORDPRESS_TYPE_ATOM; // Make ATOM default + } + + // Replaces all 'link' tags with 'url' for simplehtmldom to actually find 'links' ('url') + function ReplaceLinkTagsWithUrlTags($element){ + // We need to fix the 'link' tag as simplehtmldom cannot parse it (just rename it and load back as dom) + $element_text = $element->outertext; + $element_text = str_replace('', '', $element_text); + $element_text = str_replace('', '', $element_text); + return str_get_html($element_text); + } + public function collectData(array $param) { function StripCDATA($string) { @@ -44,17 +64,35 @@ class WordPressBridge extends BridgeAbstract { $this->url = $this->url.'/feed/atom'; $html = $this->file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404); - $posts = 
$html->find('entry'); + + // Notice: We requested an ATOM feed, however some sites return RSS feeds instead! + $type = $this->DetectContentType($html); + + if($type === WORDPRESS_TYPE_RSS) + $posts = $html->find('item'); + else + $posts = $html->find('entry'); + + if(!empty($posts) ) { $this->name = $html->find('title', 0)->plaintext; $i=0; - foreach ($html->find('entry') as $article) { + foreach ($posts as $article) { if($i < 3) { - $this->items[$i]->uri = $article->find('link', 0)->getAttribute('href'); - $this->items[$i]->title = StripCDATA($article->find('title', 0)->plaintext); - $this->items[$i]->author = trim($article->find('author', 0)->innertext); - $this->items[$i]->timestamp = strtotime($article->find('updated', 0)->innertext); + $article = $this->ReplaceLinkTagsWithUrlTags($article); + + if($type === WORDPRESS_TYPE_RSS){ + $this->items[$i]->uri = $article->find('url', 0)->innertext; // 'link' => 'url'! + $this->items[$i]->title = $article->find('title', 0)->plaintext; + $this->items[$i]->author = trim($article->find('dc:creator', 0)->innertext); + $this->items[$i]->timestamp = strtotime($article->find('pubDate', 0)->innertext); + } else { + $this->items[$i]->uri = $article->find('url', 0)->getAttribute('href'); // 'link' => 'url'! 
+ $this->items[$i]->title = StripCDATA($article->find('title', 0)->plaintext); + $this->items[$i]->author = trim($article->find('author', 0)->innertext); + $this->items[$i]->timestamp = strtotime($article->find('updated', 0)->innertext); + } $article_html = $this->file_get_html($this->items[$i]->uri); $this->items[$i]->content = clearContent($article_html->find('article', 0)->innertext); if(empty($this->items[$i]->content)) From 2672b96e11b775b37301cdb37d8b4b401bb8b03d Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 20:12:51 +0200 Subject: [PATCH 02/10] [WordPress] Initialize item before assigning values This solves warning: "Creating default object from empty value" --- bridges/WordPressBridge.php | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 27c231d5..70e7a3fb 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -80,26 +80,29 @@ class WordPressBridge extends BridgeAbstract { foreach ($posts as $article) { if($i < 3) { + $item = new \Item(); + $article = $this->ReplaceLinkTagsWithUrlTags($article); if($type === WORDPRESS_TYPE_RSS){ - $this->items[$i]->uri = $article->find('url', 0)->innertext; // 'link' => 'url'! - $this->items[$i]->title = $article->find('title', 0)->plaintext; - $this->items[$i]->author = trim($article->find('dc:creator', 0)->innertext); - $this->items[$i]->timestamp = strtotime($article->find('pubDate', 0)->innertext); + $item->uri = $article->find('url', 0)->innertext; // 'link' => 'url'! + $item->title = $article->find('title', 0)->plaintext; + $item->author = trim($article->find('dc:creator', 0)->innertext); + $item->timestamp = strtotime($article->find('pubDate', 0)->innertext); } else { - $this->items[$i]->uri = $article->find('url', 0)->getAttribute('href'); // 'link' => 'url'! 
- $this->items[$i]->title = StripCDATA($article->find('title', 0)->plaintext); - $this->items[$i]->author = trim($article->find('author', 0)->innertext); - $this->items[$i]->timestamp = strtotime($article->find('updated', 0)->innertext); + $item->uri = $article->find('url', 0)->getAttribute('href'); // 'link' => 'url'! + $item->title = StripCDATA($article->find('title', 0)->plaintext); + $item->author = trim($article->find('author', 0)->innertext); + $item->timestamp = strtotime($article->find('updated', 0)->innertext); } - $article_html = $this->file_get_html($this->items[$i]->uri); - $this->items[$i]->content = clearContent($article_html->find('article', 0)->innertext); - if(empty($this->items[$i]->content)) - $this->items[$i]->content = clearContent($article_html->find('.single-content', 0)->innertext); // another common content div - if(empty($this->items[$i]->content)) - $this->items[$i]->content = clearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 - + $article_html = $this->file_get_html($item->uri); + $item->content = clearContent($article_html->find('article', 0)->innertext); + if(empty($item->content)) + $item->content = clearContent($article_html->find('.single-content', 0)->innertext); // another common content div + if(empty($item->content)) + $item->content = clearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 + + $this->items[] = $item; $i++; } } From 21523eb549f0f94d4d5c87dffff793ffce9e3b96 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 20:15:28 +0200 Subject: [PATCH 03/10] [WordPress] Change all nested functions to member functions --- bridges/WordPressBridge.php | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 70e7a3fb..50d54c43 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -42,20 
+42,19 @@ class WordPressBridge extends BridgeAbstract { return str_get_html($element_text); } + function StripCDATA($string) { + $string = str_replace('', '', $string); + return $string; + } + + function ClearContent($content) { + $content = preg_replace('//', '', $content); + $content = preg_replace('/
', '', $string); - return $string; - } - - function clearContent($content) { - $content = preg_replace('//', '', $content); - $content = preg_replace('/
processParams($param); if (!$this->hasUrl()) { @@ -91,16 +90,16 @@ class WordPressBridge extends BridgeAbstract { $item->timestamp = strtotime($article->find('pubDate', 0)->innertext); } else { $item->uri = $article->find('url', 0)->getAttribute('href'); // 'link' => 'url'! - $item->title = StripCDATA($article->find('title', 0)->plaintext); + $item->title = $this->StripCDATA($article->find('title', 0)->plaintext); $item->author = trim($article->find('author', 0)->innertext); $item->timestamp = strtotime($article->find('updated', 0)->innertext); } $article_html = $this->file_get_html($item->uri); - $item->content = clearContent($article_html->find('article', 0)->innertext); + $item->content = $this->ClearContent($article_html->find('article', 0)->innertext); if(empty($item->content)) - $item->content = clearContent($article_html->find('.single-content', 0)->innertext); // another common content div + $item->content = $this->ClearContent($article_html->find('.single-content', 0)->innertext); // another common content div if(empty($item->content)) - $item->content = clearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 + $item->content = $this->ClearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 $this->items[] = $item; $i++; From 455b98757cd71d29f76df6474e552f67dde95b47 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 20:40:38 +0200 Subject: [PATCH 04/10] [WordPress] Don't attempt to load pages with missing .article Some sites use WordPress but don't provide pages with articles. Instead of throwing internal errors we just don't return any content. 
--- bridges/WordPressBridge.php | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 50d54c43..eee3c973 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -39,6 +39,7 @@ class WordPressBridge extends BridgeAbstract { $element_text = $element->outertext; $element_text = str_replace('', '', $element_text); $element_text = str_replace('', '', $element_text); + $element_text = str_replace('author = trim($article->find('author', 0)->innertext); $item->timestamp = strtotime($article->find('updated', 0)->innertext); } + $article_html = $this->file_get_html($item->uri); - $item->content = $this->ClearContent($article_html->find('article', 0)->innertext); - if(empty($item->content)) - $item->content = $this->ClearContent($article_html->find('.single-content', 0)->innertext); // another common content div - if(empty($item->content)) - $item->content = $this->ClearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 - + + $article = $article_html->find('article', 0); + if(!empty($article)){ + $item->content = $this->ClearContent($article->innertext); + if(empty($item->content)) + $item->content = $this->ClearContent($article_html->find('.single-content', 0)->innertext); // another common content div + if(empty($item->content)) + $item->content = $this->ClearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 + } $this->items[] = $item; $i++; } From d944558a3dcc417b6c756a6f2a8919fe1f28c073 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 20:45:21 +0200 Subject: [PATCH 05/10] [WordPress] Fix indentation and remove empty lines --- bridges/WordPressBridge.php | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index eee3c973..11cddbfc 100644 --- 
a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -21,7 +21,6 @@ class WordPressBridge extends BridgeAbstract { "identifier" : "url" } ]'; - } // Returns the content type for a given html dom @@ -62,7 +61,7 @@ class WordPressBridge extends BridgeAbstract { $this->returnError('You must specify a URL', 400); } - $this->url = $this->url.'/feed/atom'; + $this->url = $this->url.'/feed/atom'; $html = $this->file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404); // Notice: We requested an ATOM feed, however some sites return RSS feeds instead! @@ -73,10 +72,10 @@ class WordPressBridge extends BridgeAbstract { else $posts = $html->find('entry'); + if(!empty($posts) ) { + $this->name = $html->find('title', 0)->plaintext; + $i=0; - if(!empty($posts) ) { - $this->name = $html->find('title', 0)->plaintext; - $i=0; foreach ($posts as $article) { if($i < 3) { @@ -96,7 +95,7 @@ class WordPressBridge extends BridgeAbstract { $item->timestamp = strtotime($article->find('updated', 0)->innertext); } - $article_html = $this->file_get_html($item->uri); + $article_html = $this->file_get_html($item->uri); $article = $article_html->find('article', 0); if(!empty($article)){ @@ -110,8 +109,7 @@ class WordPressBridge extends BridgeAbstract { $i++; } } - } - else { + } else { $this->returnError("Sorry, {$this->url} doesn't seem to be a Wordpress blog.", 404); } } @@ -138,6 +136,4 @@ class WordPressBridge extends BridgeAbstract { private function processParams($param) { $this->url = $param['url']; } - } - From 2c41ed550def91348049815f025f1df0d6781ecc Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 21:06:12 +0200 Subject: [PATCH 06/10] [WordPress] Fix content loading --- bridges/WordPressBridge.php | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 11cddbfc..27afe0e4 100644 --- a/bridges/WordPressBridge.php +++ 
b/bridges/WordPressBridge.php @@ -86,7 +86,7 @@ class WordPressBridge extends BridgeAbstract { if($type === WORDPRESS_TYPE_RSS){ $item->uri = $article->find('url', 0)->innertext; // 'link' => 'url'! $item->title = $article->find('title', 0)->plaintext; - $item->author = trim($article->find('dc:creator', 0)->innertext); + $item->author = trim($this->StripCDATA($article->find('dc:creator', 0)->innertext)); $item->timestamp = strtotime($article->find('pubDate', 0)->innertext); } else { $item->uri = $article->find('url', 0)->getAttribute('href'); // 'link' => 'url'! @@ -97,14 +97,30 @@ class WordPressBridge extends BridgeAbstract { $article_html = $this->file_get_html($item->uri); - $article = $article_html->find('article', 0); - if(!empty($article)){ - $item->content = $this->ClearContent($article->innertext); - if(empty($item->content)) - $item->content = $this->ClearContent($article_html->find('.single-content', 0)->innertext); // another common content div - if(empty($item->content)) - $item->content = $this->ClearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5 + // Attempt to find most common content div + if(empty($item->content)){ + $article = $article_html->find('article', 0); + if(!empty($article)){ + $item->content = $this->ClearContent($article->innertext); + } } + + // another common content div + if(empty($item->content)){ + $article = $article_html->find('.single-content', 0); + if(!empty($article)){ + $item->content = $this->ClearContent($article->innertext); + } + } + + // for old WordPress themes without HTML5 + if(empty($item->content)){ + $article = $article_html->find('.post', 0); + if(!empty($article)){ + $item->content = $this->ClearContent($article->innertext); + } + } + $this->items[] = $item; $i++; } From 870ef6f6fc9216b83629a7cb32defc5db78070be Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 21:42:06 +0200 Subject: [PATCH 07/10] [WordPress] Remove forms and improve script 
removal Some sites contain scripts like these, that were not captured by the previous implementation: --- bridges/WordPressBridge.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 27afe0e4..0ba18517 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -49,8 +49,9 @@ class WordPressBridge extends BridgeAbstract { } function ClearContent($content) { - $content = preg_replace('//', '', $content); + $content = preg_replace('/]*>[^<]*<\/script>/', '', $content); $content = preg_replace('/
/', '', $content); return $content; } From 7afee0e277d82cba2da2eeb4146cce98640665c7 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 21:56:11 +0200 Subject: [PATCH 08/10] [WordPress] Fix site name on bridge content --- bridges/WordPressBridge.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 0ba18517..35ef3bfd 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -4,6 +4,7 @@ define('WORDPRESS_TYPE_RSS', 2); // Content is of type RSS class WordPressBridge extends BridgeAbstract { private $url; + public $sitename; // Name of the site public function loadMetadatas() { @@ -74,7 +75,7 @@ class WordPressBridge extends BridgeAbstract { $posts = $html->find('entry'); if(!empty($posts) ) { - $this->name = $html->find('title', 0)->plaintext; + $this->sitename = $html->find('title', 0)->plaintext; $i=0; foreach ($posts as $article) { @@ -132,7 +133,7 @@ class WordPressBridge extends BridgeAbstract { } public function getName() { - return "{$this->name} - Wordpress Bridge"; + return "{$this->sitename} - Wordpress Bridge"; } public function getURI() { From 12abf9d455a3326760345727c64dbbfc0b3d9753 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 21:57:13 +0200 Subject: [PATCH 09/10] [LeMotDuJour] remove Bridge: Supported by WordPress bridge http://www.lemotdujour.com doesn't seem to have been active for the last two years (last entry is dated 17 Oct 2014). The WordPress bridge will return the headlines, which seems to be sufficient for now. 
--- bridges/LeMotDuJourBridge.php | 55 ----------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 bridges/LeMotDuJourBridge.php diff --git a/bridges/LeMotDuJourBridge.php b/bridges/LeMotDuJourBridge.php deleted file mode 100644 index d1215941..00000000 --- a/bridges/LeMotDuJourBridge.php +++ /dev/null @@ -1,55 +0,0 @@ -maintainer = "qwertygc"; - $this->name = "LeMotDuJour Bridge"; - $this->uri = "http://www.lemotdujour.com/"; - $this->description = "Returns the newest articles."; - $this->update = "2014-05-25"; - - } - - public function collectData(array $param){ - - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } - function ExtractContent($url) { - $html2 = $this->file_get_html($url); - $text = $html2->find('div.single-contenu', 0)->innertext; - return $text; - } - $html = $this->file_get_html('http://feeds2.feedburner.com/lemotdujour/lemotdujour') or $this->returnError('Could not request LeMotDuJour.', 404); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 10) { - $item = new \Item(); - $item->title = StripCDATA($element->find('title', 0)->innertext); - $item->uri = StripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = ExtractContent($item->uri); - $this->items[] = $item; - $limit++; - } - } - - } - - public function getName(){ - return 'LeMotDuJour Bridge'; - } - - public function getURI(){ - return 'http://lemotdujour.com/'; - } - - public function getCacheDuration(){ - return 3600*2; // 2 hours - // return 0; // 2 hours - } -} From 02e169584a8dbfc243c12e4e8c0b8587a6cf1aca Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 4 Aug 2016 22:00:51 +0200 Subject: [PATCH 10/10] [Raymond] remove bridge: supported by WordPress bridge --- bridges/RaymondBridge.php | 53 --------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 
bridges/RaymondBridge.php diff --git a/bridges/RaymondBridge.php b/bridges/RaymondBridge.php deleted file mode 100644 index 9e96a54f..00000000 --- a/bridges/RaymondBridge.php +++ /dev/null @@ -1,53 +0,0 @@ -maintainer = "pit-fgfjiudghdf"; - $this->name = "Raymond"; - $this->uri = "http://www.raymond.cc"; - $this->description = "Returns the 3 newest posts from Raymond.cc (full text)"; - $this->update = "2014-05-26"; - - } - - public function collectData(array $param){ - function raymondStripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } - function raymondExtractContent($url) { - $html2 = $this->file_get_html($url); - $text = $html2->find('div.entry-content', 0)->innertext; - $text = preg_replace('/class="ad".*/', '', $text); - $text = strip_tags($text, '

'); - $text = str_replace('(adsbygoogle = window.adsbygoogle || []).push({});', '', $text); - return $text; - } - $html = $this->file_get_html('http://www.raymond.cc/blog/feed') or $this->returnError('Could not request raymond.', 404); - $limit = 0; - foreach($html->find('item') as $element) { - if($limit < 3) { - $item = new \Item(); - $item->title = raymondStripCDATA($element->find('title', 0)->innertext); - $item->uri = raymondStripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = raymondExtractContent($item->uri); - $this->items[] = $item; - $limit++; - } - } - - } - public function getName(){ - return 'raymond'; - } - public function getURI(){ - return 'http://www.raymond.cc/blog'; - } - public function getCacheDuration(){ - return 3600*12; // 12 hour - } -} -