From 179e73fb80ce9ac0944272590e5661152984b750 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Mon, 5 Sep 2016 18:43:56 +0200 Subject: [PATCH 1/3] [bridges] Change to extend from FeedExpander --- bridges/CADBridge.php | 41 ++---- bridges/CommonDreamsBridge.php | 37 ++---- bridges/DauphineLibereBridge.php | 62 ++++----- bridges/DeveloppezDotComBridge.php | 33 ++--- bridges/FuturaSciencesBridge.php | 178 ++++++++++++-------------- bridges/LeJournalDuGeekBridge.php | 34 ++--- bridges/LeMondeInformatiqueBridge.php | 74 ++++------- bridges/LichessBridge.php | 37 ++---- bridges/NextInpactBridge.php | 34 ++--- bridges/NextgovBridge.php | 84 ++++++------ bridges/NiceMatinBridge.php | 39 ++---- bridges/NumeramaBridge.php | 46 +++---- 12 files changed, 262 insertions(+), 437 deletions(-) diff --git a/bridges/CADBridge.php b/bridges/CADBridge.php index 47ff165a..eb05fd16 100644 --- a/bridges/CADBridge.php +++ b/bridges/CADBridge.php @@ -1,12 +1,22 @@ collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->CADExtractContent($item['uri']); + return $item; + } + private function CADExtractContent($url) { - $html3 = $this->getSimpleHTMLDOM($url); + $html3 = $this->get_cached($url); // The request might fail due to missing https support or wrong URL if($html3 == false) @@ -32,33 +42,6 @@ class CADBridge extends BridgeAbstract{ return ''; } - public function collectData(){ - function CADUrl($string) { - $html2 = explode("\"", $string); - $string = $html2[1]; - if (substr($string,0,4) != 'http') - return 'notanurl'; - return $string; - } - - $html = $this->getSimpleHTMLDOM('http://cdn2.cad-comic.com/rss.xml') or $this->returnServerError('Could not request CAD.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 5) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = CADUrl($element->find('description', 0)->innertext); - if ($item['uri'] != 'notanurl') { - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->CADExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } - } - public function getCacheDuration(){ return 3600*2; // 2 hours } diff --git a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php index 446a6df0..e621db41 100644 --- a/bridges/CommonDreamsBridge.php +++ b/bridges/CommonDreamsBridge.php @@ -1,39 +1,26 @@ collectExpandableDatas('http://www.commondreams.org/rss.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->CommonDreamsExtractContent($item['uri']); + return $item; + } + private function CommonDreamsExtractContent($url) { - $html3 = $this->getSimpleHTMLDOM($url); + $html3 = $this->get_cached($url); $text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext; $html3->clear(); unset ($html3); return $text; } - - public function collectData(){ - - function CommonDreamsUrl($string) { - $html2 = explode(" ", $string); - $string = $html2[2] . "/node/" . $html2[0]; - return $string; - } - - $html = $this->getSimpleHTMLDOM('http://www.commondreams.org/rss.xml') or $this->returnServerError('Could not request CommonDreams.'); - $limit = 0; - foreach($html->find('item') as $element) { - if($limit < 4) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = CommonDreamsUrl($element->find('guid', 0)->innertext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->CommonDreamsExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } } diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index 143a6c0a..d8e10ddb 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -1,10 +1,10 @@ array( @@ -30,41 +30,31 @@ class DauphineLibereBridge extends BridgeAbstract { ) )); - private function ExtractContent($url, $context) { - $html2 = $this->getSimpleHTMLDOM($url); - $text = $html2->find('div.column', 0)->innertext; - $text = preg_replace('@]*?>.*?@si', '', $text); - return $text; - } + public function collectData(){ + $url = self::URI . 'rss'; - public function collectData(){ + if (empty($this->getInput('u'))) { + $url = self::URI . $this->getInput('u') . '/rss'; + } - $context = stream_context_create($opts); + $this->collectExpandableDatas($url); + } - if (empty($this->getInput('u'))) { - $html = $this->getSimpleHTMLDOM(self::URI.$this->getInput('u').'/rss') - or $this->returnServerError('Could not request DauphineLibere.'); - } else { - $html = $this->getSimpleHTMLDOM(self::URI.'rss') - or $this->returnServerError('Could not request DauphineLibere.'); - } - $limit = 0; + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->ExtractContent($item['uri']); + return $item; + } - foreach($html->find('item') as $element) { - if($limit < 10) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = $element->find('guid', 0)->plaintext; - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->ExtractContent($item['uri'], $context); - $this->items[] = $item; - $limit++; - } - } - } + private function ExtractContent($url) { + $html2 = $this->getSimpleHTMLDOM($url); + $text = $html2->find('div.column', 0)->innertext; + $text = preg_replace('@]*?>.*?@si', '', $text); + return $text; + } - public function getCacheDuration(){ - return 3600*2; // 2 hours - } + public function getCacheDuration(){ + return 3600*2; // 2 hours + } } ?> diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index 48e29741..52e52db1 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -1,11 +1,21 @@ collectExpandableDatas(self::URI . 'index/rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->DeveloppezDotComExtractContent($item['uri']); + return $item; + } + private function DeveloppezDotComStripCDATA($string) { $string = str_replace('', '', $string); @@ -32,31 +42,12 @@ class DeveloppezDotComBridge extends BridgeAbstract{ } private function DeveloppezDotComExtractContent($url) { - $articleHTMLContent = $this->getSimpleHTMLDOM($url); + $articleHTMLContent = $this->get_cached($url); $text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext); $text = utf8_encode($text); return trim($text); } - public function collectData(){ - $rssFeed = $this->getSimpleHTMLDOM(self::URI.'index/rss') - or $this->returnServerError('Could not request '.self::URI.'index/rss'); - $limit = 0; - - foreach($rssFeed->find('item') as $element) { - if($limit < 10) { - $item = array(); - $item['title'] = $this->DeveloppezDotComStripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->DeveloppezDotComStripCDATA($element->find('guid', 0)->plaintext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $content = $this->DeveloppezDotComExtractContent($item['uri']); - $item['content'] = strlen($content) ? $content : $element->description; //In case of it is a tutorial, we just keep the original description - $this->items[] = $item; - $limit++; - } - } - } - public function getCacheDuration(){ return 1800; // 30min } diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index e4c8471f..beff9c8a 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -1,5 +1,5 @@ getInput('feed') . '.xml'; + $this->collectExpandableDatas($url); + } - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); + $article = $this->get_cached($item['uri']) + or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']); + $item['content'] = $this->ExtractArticleContent($article); + $item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article); + return $item; + } - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + + function StripRecursiveHTMLSection($string, $tag_name, $tag_start) { + $open_tag = '<'.$tag_name; + $close_tag = ''; + $close_tag_length = strlen($close_tag); + if (strpos($tag_start, $open_tag) === 0) { + while (strpos($string, $tag_start) !== false) { + $max_recursion = 100; + $section_to_remove = null; + $section_start = strpos($string, $tag_start); + $search_offset = $section_start; + do { + $max_recursion--; + $section_end = strpos($string, $close_tag, $search_offset); + $search_offset = $section_end + $close_tag_length; + $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length); + $open_tag_count = substr_count($section_to_remove, $open_tag); + $close_tag_count = substr_count($section_to_remove, $close_tag); + } while ($open_tag_count > $close_tag_count && $max_recursion > 0); $string = str_replace($section_to_remove, '', $string); - } return $string; - } - - function StripRecursiveHTMLSection($string, $tag_name, $tag_start) { - $open_tag = '<'.$tag_name; - $close_tag = ''; - $close_tag_length = strlen($close_tag); - if (strpos($tag_start, $open_tag) === 0) { - while (strpos($string, $tag_start) !== false) { - $max_recursion = 100; - $section_to_remove = null; - $section_start = strpos($string, $tag_start); - $search_offset = $section_start; - do { - $max_recursion--; - $section_end = strpos($string, $close_tag, $search_offset); - $search_offset = $section_end + $close_tag_length; - $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length); - $open_tag_count = substr_count($section_to_remove, $open_tag); - $close_tag_count = substr_count($section_to_remove, $close_tag); - } while ($open_tag_count > $close_tag_count && $max_recursion > 0); - $string = str_replace($section_to_remove, '', $string); - } - } - return $string; - } - - // Extracts the author from an article or element - function ExtractAuthor($article, $element){ - $article_author = $article->find('span.author', 0); - if($article_author){ - $authorname = trim(str_replace(', Futura-Sciences', '', $article_author->plaintext)); - if(empty($authorname)){ - $element_author = $element->find('author', 0); - if($element_author) - $authorname = StripCDATA($element_author->plaintext); - else - return ''; - } - return $authorname; - } - return ''; - } - - $url = $this->getURI().'rss/'.$this->getInput('feed').'.xml'; - - $html = $this->getSimpleHTMLDOM($url) - or $this->returnServerError('Could not request Futura-Sciences: '.$url); - $limit = 0; - - foreach($html->find('item') as $element) { - if ($limit < 10) { - $article_url = str_replace('#xtor=RSS-8', '', StripCDATA($element->find('guid', 0)->plaintext)); - $article = $this->getSimpleHTMLDOM($article_url) or $this->returnServerError('Could not request Futura-Sciences: '.$article_url); - $contents = $article->find('div.content', 0)->innertext; - - foreach (array( - '
'); - $contents = StripWithDelimiters($contents, '
StripRecursiveHTMLSection($contents , 'div', $div_start); + } + + $contents = $this->StripWithDelimiters($contents, '
'); + $contents = $this->StripWithDelimiters($contents, '

'); - return $article_html; - } + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } - $html = $this->getSimpleHTMLDOM(self::URI.'rss/rss.xml') - or $this->returnServerError('Could not request LeMondeInformatique: ' - .self::URI.'rss/rss.xml'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 5) { - - //Retrieve article details - $article_uri = $element->innertext; - $article_uri = substr($article_uri, strpos($article_uri, '') + 6); - $article_uri = substr($article_uri, 0, strpos($article_uri, '')); - $article_html = $this->getSimpleHTMLDOM($article_uri) or $this->returnServerError('Could not request LeMondeInformatique: '.$article_uri); - $article_content = CleanArticle($article_html->find('div#article', 0)->innertext); - $article_title = $article_html->find('h1.cleanprint-title', 0)->plaintext; - - //Build and add final item - $item = array(); - $item['uri'] = $article_uri; - $item['title'] = $article_title; - $item['author'] = StripCDATA($element->find('dc:creator', 0)->innertext); - $item['timestamp'] = strtotime($element->find('dc:date', 0)->plaintext); - $item['content'] = $article_content; - $this->items[] = $item; - $limit++; - } - } + function CleanArticle($article_html) { + $article_html = $this->StripWithDelimiters($article_html, ''); + $article_html = $this->StripWithDelimiters($article_html, '

'); + return $article_html; } public function getCacheDuration() { diff --git a/bridges/LichessBridge.php b/bridges/LichessBridge.php index f74c2bde..57108bd9 100644 --- a/bridges/LichessBridge.php +++ b/bridges/LichessBridge.php @@ -1,39 +1,22 @@ getSimpleHTMLDOM(self::URI.'.atom') - or $this->returnServerError('Could not retrieve Lichess blog feed.'); - - $posts_loaded = 0; - foreach($xml_feed->find('entry') as $entry) - { - if ($posts_loaded < 5) - { - $item = array(); - - $item['title'] = html_entity_decode($entry->find('title', 0)->innertext); - $item['author'] = $entry->find('author', 0)->find('name', 0)->innertext; - $item['uri'] = $entry->find('id', 0)->plaintext; - $item['timestamp'] = strtotime($entry->find('published', 0)->plaintext); - - $item['content'] = $this->retrieve_lichess_post($item['uri']); - - $this->items[] = $item; - $posts_loaded++; - } - } + public function collectData(){ + $this->collectExpandableDatas(self::URI . '.atom'); } - private function retrieve_lichess_post($blog_post_uri) - { + protected function parseItem($newsItem){ + $item = $this->parseATOMItem($newsItem); + $item['content'] = $this->retrieve_lichess_post($item['uri']); + return $item; + } + + private function retrieve_lichess_post($blog_post_uri){ if($this->get_cached_time($blog_post_uri) <= strtotime('-24 hours')) $this->remove_from_cache($blog_post_uriuri); diff --git a/bridges/NextInpactBridge.php b/bridges/NextInpactBridge.php index 8c35753d..a24a02e4 100644 --- a/bridges/NextInpactBridge.php +++ b/bridges/NextInpactBridge.php @@ -1,19 +1,23 @@ ', '', $string); - return $string; + public function collectData(){ + $this->collectExpandableDatas(self::URI . 'rss/news.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->ExtractContent($item['uri']); + return $item; } private function ExtractContent($url) { - $html2 = $this->getSimpleHTMLDOM($url); + $html2 = $this->get_cached($url); $text = '

'.$html2->find('span.sub_title', 0)->innertext.'

' .'

-

' .'
'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'
'; @@ -22,22 +26,4 @@ class NextInpactBridge extends BridgeAbstract { $text = $text.'

'.$premium_article->innertext.'

'; return $text; } - - public function collectData(){ - $html = $this->getSimpleHTMLDOM(self::URI.'rss/news.xml') or $this->returnServerError('Could not request NextInpact.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 3) { - $item = array(); - $item['title'] = $this->StripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->StripCDATA($element->find('guid', 0)->plaintext); - $item['author'] = $this->StripCDATA($element->find('creator', 0)->innertext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->ExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } } diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index ee4f2996..dee8c370 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -1,5 +1,5 @@ collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/'); + } - function ExtractFromDelimiters($string, $start, $end) { - if (strpos($string, $start) !== false) { - $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); - $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); - return $section_retrieved; - } return false; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); - $string = str_replace($section_to_remove, '', $string); - } return $string; - } + $item['content'] = ''; - $category = $this->getInput('category'); - $url = $this->getURI().'rss/'.$category.'/'; - $html = $this->getSimpleHTMLDOM($url) or $this->returnServerError('Could not request Nextgov: '.$url); - $limit = 0; - - foreach ($html->find('item') as $element) { - if ($limit >= 10) { - break; + $namespaces = $newsItem->getNamespaces(true); + if(isset($namespaces['media'])){ + $media = $newsItem->children($namespaces['media']); + if(isset($media->content)){ + $attributes = $media->content->attributes(); + $item['content'] = ''; } - - $article_url = ExtractFromDelimiters($element->innertext, '', ''); - $article_author = ExtractFromDelimiters($element->innertext, 'dc/elements/1.1/">', ''); - $article_title = $element->find('title', 0)->plaintext; - $article_subtitle = $element->find('description', 0)->plaintext; - $article_timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $article_thumbnail = ExtractFromDelimiters($element->innertext, 'getSimpleHTMLDOM($article_url) or $this->returnServerError('Could not request Nextgov: '.$article_url); - - $contents = $article->find('div.wysiwyg', 0)->innertext; - $contents = StripWithDelimiters($contents, '
', '
'); - $contents = StripWithDelimiters($contents, ''); //ad outer div - $contents = StripWithDelimiters($contents, ''); - $contents = ($article_thumbnail == '' ? '' : '

') - .'

'.$article_subtitle.'

' - .trim($contents); - - $item = array(); - $item['uri'] = $article_url; - $item['title'] = $article_title; - $item['author'] = $article_author; - $item['timestamp'] = $article_timestamp; - $item['content'] = $contents; - $this->items[] = $item; - $limit++; } + + $item['content'] .= $this->ExtractContent($item['uri']); + return $item; + } + + private function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + + private function ExtractContent($url){ + $article = $this->get_cached($url) + or $this->returnServerError('Could not request Nextgov: ' . $url); + + $contents = $article->find('div.wysiwyg', 0)->innertext; + $contents = $this->StripWithDelimiters($contents, '
', '
'); + $contents = $this->StripWithDelimiters($contents, ''); //ad outer div + return $this->StripWithDelimiters($contents, ''); + $contents = ($article_thumbnail == '' ? '' : '

') + .'

'.$article_subtitle.'

' + .trim($contents); } } diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index 3c189090..0f9d011a 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -1,13 +1,23 @@ collectExpandableDatas(self::URI . 'derniere-minute/rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->NiceMatinExtractContent($item['uri']); + return $item; + } + private function NiceMatinExtractContent($url) { - $html = $this->getSimpleHTMLDOM($url); + $html = $this->get_cached($url); if(!$html) return 'Could not acquire content from url: ' . $url . '!'; @@ -19,29 +29,4 @@ class NiceMatinBridge extends BridgeAbstract{ $text = strip_tags($text, '

'); return $text; } - - public function collectData(){ - $html = $this->getSimpleHTMLDOM(self::URI.'derniere-minute/rss') - or $this->returnServerError('Could not request NiceMatin.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit >= 10) { - break; - } - // We need to fix the 'link' tag as simplehtmldom cannot parse it (just rename it and load back as dom) - $element_text = $element->outertext; - $element_text = str_replace('', '', $element_text); - $element_text = str_replace('', '', $element_text); - $element = str_get_html($element_text); - - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = $element->find('url', 0)->innertext; - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->NiceMatinExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } } diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index 1e80affb..48260a09 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -1,5 +1,5 @@ collectExpandableDatas(self::URI . 'feed/'); + } - function NumeramaStripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->ExtractContent($item['uri']); + return $item; + } - $feed = self::URI.'feed/'; - $html = $this->getSimpleHTMLDOM($feed) or $this->returnServerError('Could not request Numerama: '.$feed); - $limit = 0; + private function ExtractContent($url){ + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); - foreach($html->find('item') as $element) { - if($limit < 5) { - $item = array(); - $item['title'] = html_entity_decode(NumeramaStripCDATA($element->find('title', 0)->innertext)); - $item['author'] = NumeramaStripCDATA($element->find('dc:creator', 0)->innertext); - $item['uri'] = NumeramaStripCDATA($element->find('guid', 0)->plaintext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - - $article_url = NumeramaStripCDATA($element->find('guid', 0)->plaintext); - if($this->get_cached_time($article_url) <= strtotime('-24 hours')) - $this->remove_from_cache($article_url); - - $article_html = $this->get_cached($article_url) or $this->returnServerError('Could not request Numerama: '.$article_url); - $contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block - $contents = ''; // add post picture - $contents = $contents.$article_html->find('article[class=post-content]', 0)->innertext; // extract the post - - $item['content'] = $contents; - $this->items[] = $item; - $limit++; - } - } + $article_html = $this->get_cached($url) or $this->returnServerError('Could not request Numerama: '.$url); + $contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block + $contents = ''; // add post picture + return $contents . $article_html->find('article[class=post-content]', 0)->innertext; // extract the post } public function getCacheDuration() { From f1fb527607a0cdf39f57e94fcec050cd2f01cb63 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Mon, 5 Sep 2016 20:17:00 +0200 Subject: [PATCH 2/3] [FeedExpander] Add optional parameter to specify max items Allows caller of collectExpandableDatas to request a limited amount of items --- lib/FeedExpander.php | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lib/FeedExpander.php b/lib/FeedExpander.php index f164799e..abaf1210 100644 --- a/lib/FeedExpander.php +++ b/lib/FeedExpander.php @@ -6,7 +6,7 @@ abstract class FeedExpander extends HttpCachingBridgeAbstract { private $uri; private $description; - public function collectExpandableDatas($url){ + public function collectExpandableDatas($url, $maxItems = -1){ if(empty($url)){ $this->returnServerError('There is no $url for this RSS expander'); } @@ -25,43 +25,46 @@ abstract class FeedExpander extends HttpCachingBridgeAbstract { $this->debugMessage('Detected RSS format'); if(isset($rssContent->item[0])){ $this->debugMessage('Detected RSS 1.0 format'); - $this->collect_RSS_1_0_data($rssContent); + $this->collect_RSS_1_0_data($rssContent, $maxItems); } else { $this->debugMessage('Detected RSS 0.9x or 2.0 format'); - $this->collect_RSS_2_0_data($rssContent); + $this->collect_RSS_2_0_data($rssContent, $maxItems); } } elseif(isset($rssContent->entry[0])){ $this->debugMessage('Detected ATOM format'); - $this->collect_ATOM_data($rssContent); + $this->collect_ATOM_data($rssContent, $maxItems); } else { $this->debugMessage('Unknown feed format/version'); $this->returnServerError('The feed format is unknown!'); } } - protected function collect_RSS_1_0_data($rssContent){ + protected function collect_RSS_1_0_data($rssContent, $maxItems){ $this->load_RSS_2_0_feed_data($rssContent->channel[0]); foreach($rssContent->item as $item){ $this->debugMessage('parsing item ' . var_export($item, true)); $this->items[] = $this->parseItem($item); + if($maxItems !== -1 && count($this->items) >= $maxItems) break; } } - protected function collect_RSS_2_0_data($rssContent){ + protected function collect_RSS_2_0_data($rssContent, $maxItems){ $rssContent = $rssContent->channel[0]; $this->debugMessage('RSS content is ===========\n' . var_export($rssContent, true) . '==========='); $this->load_RSS_2_0_feed_data($rssContent); foreach($rssContent->item as $item){ $this->debugMessage('parsing item ' . var_export($item, true)); $this->items[] = $this->parseItem($item); + if($maxItems !== -1 && count($this->items) >= $maxItems) break; } } - protected function collect_ATOM_data($content){ + protected function collect_ATOM_data($content, $maxItems){ $this->load_ATOM_feed_data($content); foreach($content->entry as $item){ $this->debugMessage('parsing item ' . var_export($item, true)); $this->items[] = $this->parseItem($item); + if($maxItems !== -1 && count($this->items) >= $maxItems) break; } } From 2861a855e4641f1d072813fe60acef1edfe8c46f Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Mon, 5 Sep 2016 20:26:45 +0200 Subject: [PATCH 3/3] [bridges] Define max items and clear caches --- bridges/CADBridge.php | 4 +++- bridges/CommonDreamsBridge.php | 4 +++- bridges/DauphineLibereBridge.php | 6 ++++-- bridges/DeveloppezDotComBridge.php | 4 +++- bridges/FuturaSciencesBridge.php | 16 +++++++++------- bridges/LeJournalDuGeekBridge.php | 4 +++- bridges/LeMondeInformatiqueBridge.php | 16 +++++++++------- bridges/LichessBridge.php | 2 +- bridges/NextInpactBridge.php | 4 +++- bridges/NextgovBridge.php | 2 +- bridges/NiceMatinBridge.php | 4 +++- bridges/NumeramaBridge.php | 2 +- 12 files changed, 43 insertions(+), 25 deletions(-) diff --git a/bridges/CADBridge.php b/bridges/CADBridge.php index eb05fd16..1fdcfb5a 100644 --- a/bridges/CADBridge.php +++ b/bridges/CADBridge.php @@ -6,7 +6,7 @@ class CADBridge extends FeedExpander { const DESCRIPTION = "Returns the newest articles."; public function collectData(){ - $this->collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml'); + $this->collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml', 10); } protected function parseItem($newsItem){ @@ -16,6 +16,8 @@ class CADBridge extends FeedExpander { } private function CADExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html3 = $this->get_cached($url); // The request might fail due to missing https support or wrong URL diff --git a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php index e621db41..937590c6 100644 --- a/bridges/CommonDreamsBridge.php +++ b/bridges/CommonDreamsBridge.php @@ -7,7 +7,7 @@ class CommonDreamsBridge extends FeedExpander { const DESCRIPTION = "Returns the newest articles."; public function collectData(){ - $this->collectExpandableDatas('http://www.commondreams.org/rss.xml'); + $this->collectExpandableDatas('http://www.commondreams.org/rss.xml', 10); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class CommonDreamsBridge extends FeedExpander { } private function CommonDreamsExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html3 = $this->get_cached($url); $text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext; $html3->clear(); diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index d8e10ddb..2f645c95 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -37,7 +37,7 @@ class DauphineLibereBridge extends FeedExpander { $url = self::URI . $this->getInput('u') . '/rss'; } - $this->collectExpandableDatas($url); + $this->collectExpandableDatas($url, 10); } protected function parseItem($newsItem){ @@ -47,7 +47,9 @@ class DauphineLibereBridge extends FeedExpander { } private function ExtractContent($url) { - $html2 = $this->getSimpleHTMLDOM($url); + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); + $html2 = $this->get_cached($url); $text = $html2->find('div.column', 0)->innertext; $text = preg_replace('@]*?>.*?@si', '', $text); return $text; diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index 52e52db1..cb277eca 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -7,7 +7,7 @@ class DeveloppezDotComBridge extends FeedExpander { const DESCRIPTION = "Returns the 15 newest posts from DeveloppezDotCom (full text)."; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'index/rss'); + $this->collectExpandableDatas(self::URI . 'index/rss', 15); } protected function parseItem($newsItem){ @@ -42,6 +42,8 @@ class DeveloppezDotComBridge extends FeedExpander { } private function DeveloppezDotComExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $articleHTMLContent = $this->get_cached($url); $text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext); $text = utf8_encode($text); diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index beff9c8a..aef5813c 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -78,20 +78,22 @@ class FuturaSciencesBridge extends FeedExpander { ) )); - public function collectData(){ + public function collectData(){ $url = self::URI . 'rss/' . $this->getInput('feed') . '.xml'; - $this->collectExpandableDatas($url); - } + $this->collectExpandableDatas($url, 10); + } - protected function parseItem($newsItem){ - $item = $this->parseRSS_2_0_Item($newsItem); + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); + if($this->get_cached_time($item['uri']) <= strtotime('-24 hours')) + $this->remove_from_cache($item['uri']); $article = $this->get_cached($item['uri']) or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']); $item['content'] = $this->ExtractArticleContent($article); $item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article); - return $item; - } + return $item; + } function StripWithDelimiters($string, $start, $end) { while (strpos($string, $start) !== false) { diff --git a/bridges/LeJournalDuGeekBridge.php b/bridges/LeJournalDuGeekBridge.php index dd0c444c..c537a159 100644 --- a/bridges/LeJournalDuGeekBridge.php +++ b/bridges/LeJournalDuGeekBridge.php @@ -7,7 +7,7 @@ class LeJournalDuGeekBridge extends FeedExpander { const DESCRIPTION = "Returns the 5 newest posts from LeJournalDuGeek (full text)."; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss'); + $this->collectExpandableDatas(self::URI . 'rss', 5); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class LeJournalDuGeekBridge extends FeedExpander { } private function LeJournalDuGeekExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $articleHTMLContent = $this->get_cached($url); $text = $articleHTMLContent->find('div.post-content', 0)->innertext; diff --git a/bridges/LeMondeInformatiqueBridge.php b/bridges/LeMondeInformatiqueBridge.php index e361ea80..3b3e5b4c 100644 --- a/bridges/LeMondeInformatiqueBridge.php +++ b/bridges/LeMondeInformatiqueBridge.php @@ -6,18 +6,20 @@ class LeMondeInformatiqueBridge extends FeedExpander { const URI = "http://www.lemondeinformatique.fr/"; const DESCRIPTION = "Returns the newest articles."; - public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss/rss.xml'); - } + public function collectData(){ + $this->collectExpandableDatas(self::URI . 'rss/rss.xml', 10); + } - protected function parseItem($newsItem){ - $item = $this->parseRSS_1_0_Item($newsItem); + protected function parseItem($newsItem){ + $item = $this->parseRSS_1_0_Item($newsItem); + if($this->get_cached_time($item['uri']) <= strtotime('-24 hours')) + $this->remove_from_cache($item['uri']); $article_html = $this->get_cached($item['uri']) or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']); $item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext); $item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext; - return $item; - } + return $item; + } function StripCDATA($string) { $string = str_replace('collectExpandableDatas(self::URI . '.atom'); + $this->collectExpandableDatas(self::URI . '.atom', 5); } protected function parseItem($newsItem){ diff --git a/bridges/NextInpactBridge.php b/bridges/NextInpactBridge.php index a24a02e4..a047f63a 100644 --- a/bridges/NextInpactBridge.php +++ b/bridges/NextInpactBridge.php @@ -7,7 +7,7 @@ class NextInpactBridge extends FeedExpander { const DESCRIPTION = "Returns the newest articles."; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss/news.xml'); + $this->collectExpandableDatas(self::URI . 'rss/news.xml', 10); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class NextInpactBridge extends FeedExpander { } private function ExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html2 = $this->get_cached($url); $text = '

'.$html2->find('span.sub_title', 0)->innertext.'

' .'

-

' diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index dee8c370..5d26ec5f 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -26,7 +26,7 @@ class NextgovBridge extends FeedExpander { )); public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/'); + $this->collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/', 10); } protected function parseItem($newsItem){ diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index 0f9d011a..9f0e5525 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -7,7 +7,7 @@ class NiceMatinBridge extends FeedExpander { const DESCRIPTION = "Returns the 10 newest posts from NiceMatin (full text)"; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'derniere-minute/rss'); + $this->collectExpandableDatas(self::URI . 'derniere-minute/rss', 10); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class NiceMatinBridge extends FeedExpander { } private function NiceMatinExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html = $this->get_cached($url); if(!$html) return 'Could not acquire content from url: ' . $url . '!'; diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index 48260a09..202d5528 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -7,7 +7,7 @@ class NumeramaBridge extends FeedExpander { const DESCRIPTION = 'Returns the 5 newest posts from Numerama (full text)'; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'feed/'); + $this->collectExpandableDatas(self::URI . 'feed/', 5); } protected function parseItem($newsItem){