diff --git a/bridges/AcrimedBridge.php b/bridges/AcrimedBridge.php index 50a50304..54c6f873 100644 --- a/bridges/AcrimedBridge.php +++ b/bridges/AcrimedBridge.php @@ -1,40 +1,26 @@ collectExpandableDatas("http://www.acrimed.org/spip.php?page=backend"); + } - public function collectData(){ + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); - $this->collectExpandableDatas(static::URI.'spip.php?page=backend'); + $hs = new HTMLSanitizer(); + $articlePage = $this->getSimpleHTMLDOM($newsItem->link); + $article = $hs->sanitize($articlePage->find('article.article1', 0)->innertext); + $article = HTMLSanitizer::defaultImageSrcTo($article, "http://www.acrimed.org/"); + $item['content'] = $article; - } - - protected function parseRSSItem($newsItem) { - - $hs = new HTMLSanitizer(); - - $namespaces = $newsItem->getNameSpaces(true); - $dc = $newsItem->children($namespaces['dc']); - - $item = array(); - $item['uri'] = trim($newsItem->link); - $item['title'] = trim($newsItem->title); - $item['timestamp'] = strtotime($dc->date); - - $articlePage = $this->getSimpleHTMLDOM($newsItem->link); - $article = $hs->sanitize($articlePage->find('article.article1', 0)->innertext); - $article = HTMLSanitizer::defaultImageSrcTo($article, static::URI); - - $item['content'] = $article; - - - return $item; - - } + return $item; + } public function getCacheDuration(){ return 4800; // 2 hours diff --git a/bridges/FeedExpanderExampleBridge.php b/bridges/FeedExpanderExampleBridge.php new file mode 100644 index 00000000..c9badd68 --- /dev/null +++ b/bridges/FeedExpanderExampleBridge.php @@ -0,0 +1,62 @@ + array( + 'version' => array( + 'name' => 'Version', + 'type' => 'list', + 'required' => true, + 'title' => 'Select your feed format/version', + 'defaultValue' => 'RSS 2.0', + 'values' => array( + 'RSS 0.91' => 'rss_0_9_1', + 'RSS 1.0' => 'rss_1_0', + 'RSS 2.0' => 'rss_2_0', + 'ATOM 1.0' => 'atom_1_0' + ) + ) + ) + ); + + public function collectData(){ + switch($this->getInput('version')){ + case 'rss_0_9_1': + parent::collectExpandableDatas('http://static.userland.com/gems/backend/sampleRss.xml'); + break; + case 'rss_1_0': + parent::collectExpandableDatas('http://feeds.nature.com/nature/rss/current?format=xml'); + break; + case 'rss_2_0': + parent::collectExpandableDatas('http://feeds.rssboard.org/rssboard?format=xml'); + break; + case 'atom_1_0': + parent::collectExpandableDatas('http://segfault.linuxmint.com/feed/atom/'); + break; + default: $this->returnClientError('Unknown version ' . $this->getInput('version') . '!'); + } + } + + protected function parseItem($newsItem) { + switch($this->getInput('version')){ + case 'rss_0_9_1': + return $this->parseRSS_0_9_1_Item($newsItem); + break; + case 'rss_1_0': + return $this->parseRSS_1_0_Item($newsItem); + break; + case 'rss_2_0': + return $this->parseRSS_2_0_Item($newsItem); + break; + case 'atom_1_0': + return $this->parseATOMItem($newsItem); + break; + default: $this->returnClientError('Unknown version ' . $this->getInput('version') . '!'); + } + } +} diff --git a/bridges/FreenewsBridge.php b/bridges/FreenewsBridge.php index 60fd34e2..dbc46b9e 100644 --- a/bridges/FreenewsBridge.php +++ b/bridges/FreenewsBridge.php @@ -1,34 +1,22 @@ title); - $this->debugMessage("item has for title \"".$item['title']."\""); - if(empty($newsItem->guid)) { - $item['uri'] = (string) $newsItem->link; - } else { - $item['uri'] = (string) $newsItem->guid; - } - // now load that uri from cache - $this->debugMessage("now loading page ".$item['uri']); + protected function parseItem($newsItem) { + $item = $this->parseRSS_2_0_Item($newsItem); + $articlePage = $this->get_cached($item['uri']); - $content = $articlePage->find('.post-container', 0); $item['content'] = $content->innertext; - $item['author'] = $articlePage->find('a[rel=author]', 0)->innertext; - // format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple - $item['timestamp'] = $this->RSS_2_0_time_to_timestamp($newsItem); + return $item; } } diff --git a/bridges/Les400CulsBridge.php b/bridges/Les400CulsBridge.php index ce56537b..972909a3 100644 --- a/bridges/Les400CulsBridge.php +++ b/bridges/Les400CulsBridge.php @@ -1,35 +1,19 @@ collectExpandableDatas(self::URI.'feeds/'); + $this->collectExpandableDatas(self::URI . 'feeds/'); } - protected function parseRSSItem($newsItem) { - $item = array(); - $item['title'] = trim((string) $newsItem->title); - $this->debugMessage("browsing item ".var_export($newsItem, true)); - if(empty($newsItem->guid)) { - $item['uri'] = (string) $newsItem->link; - } else { - $item['uri'] = (string) $newsItem->guid; - } - // now load that uri from cache - $this->debugMessage("now loading page ".$item['uri']); -// $articlePage = $this->get_cached($item['uri']); - -// $content = $articlePage->find('.post-container', 0); - $item['content'] = (string) $newsItem->description; - $item['author'] = (string) $newsItem->author; - $item['timestamp'] = $this->RSS_2_0_time_to_timestamp($newsItem); - return $item; + protected function parseItem($newsItem){ + return $this->parseRSS_2_0_Item($newsItem); } + public function getCacheDuration(){ return 7200; // 2h hours } diff --git a/bridges/TheOatMealBridge.php b/bridges/TheOatMealBridge.php index dc6804aa..eee9283c 100644 --- a/bridges/TheOatMealBridge.php +++ b/bridges/TheOatMealBridge.php @@ -1,5 +1,5 @@ collectExpandableDatas('http://feeds.feedburner.com/oatmealfeed'); } + protected function parseItem($newsItem) { + $item = $this->parseRSS_1_0_Item($newsItem); - /** - * Since the oatmeal produces a weird RSS feed, I have to fix it by loading the items separatly from the feed infos - */ - protected function collect_RSS_2_0_data($rssContent) { - $rssContent->registerXPathNamespace("dc", "http://purl.org/dc/elements/1.1/"); - $rssHeaderContent = $rssContent->channel[0]; - $this->debugMessage("RSS content is ===========\n".var_export($rssHeaderContent, true)."==========="); - $this->load_RSS_2_0_feed_data($rssHeaderContent); - foreach($rssContent->item as $item) { - $this->debugMessage("parsing item ".var_export($item, true)); - $this->items[] = $this->parseRSSItem($item); - } - } - - - protected function parseRSSItem($newsItem) { - $namespaces = $newsItem->getNameSpaces(true); - $dc = $newsItem->children($namespaces['dc']); - $rdf = $newsItem->children($namespaces['rdf']); - $item = array(); - $item['title'] = trim($newsItem->title); - $this->debugMessage("browsing Oatmeal item ".var_export($newsItem, true)); - $item['uri']=(string) $newsItem->attributes($namespaces['rdf'])->about; - // now load that uri from cache - $this->debugMessage("now loading page ".$item['uri']); $articlePage = $this->get_cached($item['uri']); - $content = $articlePage->find('#comic', 0); - if($content==null) { - $content = $articlePage->find('#blog'); - } - $item['content'] = $content->innertext; + if(is_null($content)) // load alternative + $content = $articlePage->find('#blog', 0); + + if(!is_null($content)) + $item['content'] = $content->innertext; - $this->debugMessage("dc content is ".var_export($dc, true)); - $item['author'] = (string) $dc->creator; - $item['timestamp'] = DateTime::createFromFormat(DateTime::ISO8601, $dc->date)->getTimestamp(); - $this->debugMessage("writtem by ".$item['author']." on ".$item['timestamp']); return $item; } diff --git a/lib/Bridge.php b/lib/Bridge.php index 2a83630f..72734df1 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -585,29 +585,51 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { } } -abstract class RssExpander extends HttpCachingBridgeAbstract { +abstract class FeedExpander extends HttpCachingBridgeAbstract { private $name; private $uri; private $description; - public function collectExpandableDatas($name){ - if(empty($name)){ - $this->returnServerError('There is no $name for this RSS expander'); + public function collectExpandableDatas($url){ + if(empty($url)){ + $this->returnServerError('There is no $url for this RSS expander'); } - $this->debugMessage('Loading from ' . $name); + $this->debugMessage('Loading from ' . $url); /* Notice we do not use cache here on purpose: * we want a fresh view of the RSS stream each time */ - $content = $this->getContents($name) or $this->returnServerError('Could not request ' . $name); - + $content = $this->getContents($url) + or $this->returnServerError('Could not request ' . $url); $rssContent = simplexml_load_string($content); - $this->debugMessage('loaded RSS from ' . $name); - // TODO insert RSS format detection - // For now we always assume RSS 2.0 - $this->collect_RSS_2_0_data($rssContent); + + $this->debugMessage('Detecting feed format/version'); + if(isset($rssContent->channel[0])){ + $this->debugMessage('Detected RSS format'); + if(isset($rssContent->item[0])){ + $this->debugMessage('Detected RSS 1.0 format'); + $this->collect_RSS_1_0_data($rssContent); + } else { + $this->debugMessage('Detected RSS 0.9x or 2.0 format'); + $this->collect_RSS_2_0_data($rssContent); + } + } elseif(isset($rssContent->entry[0])){ + $this->debugMessage('Detected ATOM format'); + $this->collect_ATOM_data($rssContent); + } else { + $this->debugMessage('Unknown feed format/version'); + $this->returnServerError('The feed format is unknown!'); + } + } + + protected function collect_RSS_1_0_data($rssContent){ + $this->load_RSS_2_0_feed_data($rssContent->channel[0]); + foreach($rssContent->item as $item){ + $this->debugMessage('parsing item ' . var_export($item, true)); + $this->items[] = $this->parseItem($item); + } } protected function collect_RSS_2_0_data($rssContent){ @@ -616,7 +638,15 @@ abstract class RssExpander extends HttpCachingBridgeAbstract { $this->load_RSS_2_0_feed_data($rssContent); foreach($rssContent->item as $item){ $this->debugMessage('parsing item ' . var_export($item, true)); - $this->items[] = $this->parseRSSItem($item); + $this->items[] = $this->parseItem($item); + } + } + + protected function collect_ATOM_data($content){ + $this->load_ATOM_feed_data($content); + foreach($content->entry as $item){ + $this->debugMessage('parsing item ' . var_export($item, true)); + $this->items[] = $this->parseItem($item); } } @@ -631,12 +661,88 @@ abstract class RssExpander extends HttpCachingBridgeAbstract { $this->description = trim($rssContent->description); } + protected function load_ATOM_feed_data($content){ + $this->name = $content->title; + + // Find best link (only one, or first of 'alternate') + if(!isset($content->link)){ + $this->uri = ''; + } elseif (count($content->link) === 1){ + $this->uri = $content->link[0]['href']; + } else { + $this->uri = ''; + foreach($content->link as $link){ + if(strtolower($link['rel']) === 'alternate'){ + $this->uri = $link['href']; + break; + } + } + } + + if(isset($content->subtitle)) + $this->description = $content->subtitle; + } + + protected function parseATOMItem($feedItem){ + $item = array(); + if(isset($feedItem->id)) $item['uri'] = $feedItem->id; + if(isset($feedItem->title)) $item['title'] = $feedItem->title; + if(isset($feedItem->updated)) $item['timestamp'] = strtotime($feedItem->updated); + if(isset($feedItem->author)) $item['author'] = $feedItem->author->name; + if(isset($feedItem->content)) $item['content'] = $feedItem->content; + return $item; + } + + protected function parseRSS_0_9_1_Item($feedItem){ + $item = array(); + if(isset($feedItem->link)) $item['uri'] = $feedItem->link; + if(isset($feedItem->title)) $item['title'] = $feedItem->title; + // rss 0.91 doesn't support timestamps + // rss 0.91 doesn't support authors + if(isset($feedItem->description)) $item['content'] = $feedItem->description; + return $item; + } + + protected function parseRSS_1_0_Item($feedItem){ + // 1.0 adds optional elements around the 0.91 standard + $item = $this->parseRSS_0_9_1_Item($feedItem); + + $namespaces = $feedItem->getNamespaces(true); + if(isset($namespaces['dc'])){ + $dc = $feedItem->children($namespaces['dc']); + if(isset($dc->date)) $item['timestamp'] = strtotime($dc->date); + if(isset($dc->creator)) $item['author'] = $dc->creator; + } + + return $item; + } + + protected function parseRSS_2_0_Item($feedItem){ + // Primary data is compatible to 0.91 with some additional data + $item = $this->parseRSS_0_9_1_Item($feedItem); + + $namespaces = $feedItem->getNamespaces(true); + if(isset($namespaces['dc'])) $dc = $feedItem->children($namespaces['dc']); + + if(isset($feedItem->pubDate)){ + $item['timestamp'] = strtotime($feedItem->pubDate); + } elseif(isset($dc->date)){ + $item['timestamp'] = strtotime($dc->date); + } + if(isset($feedItem->author)){ + $item['author'] = $feedItem->author; + } elseif(isset($dc->creator)){ + $item['author'] = $dc->creator; + } + return $item; + } + /** * Method should return, from a source RSS item given by lastRSS, one of our Items objects * @param $item the input rss item * @return a RSS-Bridge Item, with (hopefully) the whole content) */ - abstract protected function parseRSSItem($item); + abstract protected function parseItem($item); public function getURI(){ return $this->uri;