This commit is contained in:
logmanoriginal 2016-09-04 13:51:19 +02:00
commit 671703cd37
6 changed files with 225 additions and 126 deletions

View file

@ -1,6 +1,5 @@
<?php <?php
class AcrimedBridge extends RssExpander{ class AcrimedBridge extends FeedExpander {
const MAINTAINER = "qwertygc"; const MAINTAINER = "qwertygc";
const NAME = "Acrimed Bridge"; const NAME = "Acrimed Bridge";
@ -8,32 +7,19 @@ class AcrimedBridge extends RssExpander{
const DESCRIPTION = "Returns the newest articles."; const DESCRIPTION = "Returns the newest articles.";
public function collectData(){ public function collectData(){
$this->collectExpandableDatas("http://www.acrimed.org/spip.php?page=backend");
$this->collectExpandableDatas(static::URI.'spip.php?page=backend');
} }
protected function parseRSSItem($newsItem) { protected function parseItem($newsItem){
$item = $this->parseRSS_2_0_Item($newsItem);
$hs = new HTMLSanitizer(); $hs = new HTMLSanitizer();
$namespaces = $newsItem->getNameSpaces(true);
$dc = $newsItem->children($namespaces['dc']);
$item = array();
$item['uri'] = trim($newsItem->link);
$item['title'] = trim($newsItem->title);
$item['timestamp'] = strtotime($dc->date);
$articlePage = $this->getSimpleHTMLDOM($newsItem->link); $articlePage = $this->getSimpleHTMLDOM($newsItem->link);
$article = $hs->sanitize($articlePage->find('article.article1', 0)->innertext); $article = $hs->sanitize($articlePage->find('article.article1', 0)->innertext);
$article = HTMLSanitizer::defaultImageSrcTo($article, static::URI); $article = HTMLSanitizer::defaultImageSrcTo($article, "http://www.acrimed.org/");
$item['content'] = $article; $item['content'] = $article;
return $item; return $item;
} }
public function getCacheDuration(){ public function getCacheDuration(){

View file

@ -0,0 +1,62 @@
<?php
/**
 * Example bridge used to exercise FeedExpander against one sample feed per
 * supported format (RSS 0.91, RSS 1.0, RSS 2.0 and ATOM 1.0).
 *
 * The format is chosen through the 'version' input; the same choice decides
 * both which sample feed is requested and which FeedExpander item parser is
 * applied to every entry.
 */
class FeedExpanderExampleBridge extends FeedExpander {

    const MAINTAINER = 'logmanoriginal';
    const NAME = 'FeedExpander Example';
    const URI = '#';
    const DESCRIPTION = 'Example bridge to test FeedExpander';

    const PARAMETERS = array(
        'Feed' => array(
            'version' => array(
                'name' => 'Version',
                'type' => 'list',
                'required' => true,
                'title' => 'Select your feed format/version',
                'defaultValue' => 'RSS 2.0',
                'values' => array(
                    'RSS 0.91' => 'rss_0_9_1',
                    'RSS 1.0' => 'rss_1_0',
                    'RSS 2.0' => 'rss_2_0',
                    'ATOM 1.0' => 'atom_1_0'
                )
            )
        )
    );

    /**
     * Single source of truth for the supported versions: each 'version' value
     * maps to the sample feed to request and the name of the parser method
     * that handles its items. Keeping both in one table prevents the feed
     * list and the parser list from drifting apart.
     */
    private static $feeds = array(
        'rss_0_9_1' => array(
            'url' => 'http://static.userland.com/gems/backend/sampleRss.xml',
            'parser' => 'parseRSS_0_9_1_Item'
        ),
        'rss_1_0' => array(
            'url' => 'http://feeds.nature.com/nature/rss/current?format=xml',
            'parser' => 'parseRSS_1_0_Item'
        ),
        'rss_2_0' => array(
            'url' => 'http://feeds.rssboard.org/rssboard?format=xml',
            'parser' => 'parseRSS_2_0_Item'
        ),
        'atom_1_0' => array(
            'url' => 'http://segfault.linuxmint.com/feed/atom/',
            'parser' => 'parseATOMItem'
        )
    );

    /**
     * Requests and expands the sample feed that belongs to the selected version.
     * An unknown version is reported through returnClientError().
     */
    public function collectData(){
        $feed = $this->getSelectedFeed();

        if($feed !== null)
            $this->collectExpandableDatas($feed['url']);
    }

    /**
     * Parses one feed entry with the parser that belongs to the selected version.
     *
     * @param $newsItem the feed entry handed over by FeedExpander
     * @return the item array built by the version specific parser, or null
     * when the version is unknown and returnClientError() returned
     */
    protected function parseItem($newsItem){
        $feed = $this->getSelectedFeed();

        if($feed === null)
            return null;

        $parser = $feed['parser'];
        return $this->$parser($newsItem);
    }

    /**
     * Looks up the table entry for the 'version' input.
     *
     * @return the matching entry of self::$feeds, or null after the unknown
     * version has been reported (only reached if returnClientError() returns
     * instead of aborting the request)
     */
    private function getSelectedFeed(){
        $version = $this->getInput('version');

        if(isset(self::$feeds[$version]))
            return self::$feeds[$version];

        $this->returnClientError('Unknown version ' . $version . '!');
        return null;
    }
}

View file

@ -1,6 +1,5 @@
<?php <?php
define("FREENEWS_RSS", 'http://feeds.feedburner.com/Freenews-Freebox?format=xml'); class FreenewsBridge extends FeedExpander {
class FreenewsBridge extends RssExpander {
const MAINTAINER = "mitsukarenai"; const MAINTAINER = "mitsukarenai";
const NAME = "Freenews"; const NAME = "Freenews";
@ -8,27 +7,16 @@ class FreenewsBridge extends RssExpander {
const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales."; const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales.";
public function collectData(){ public function collectData(){
parent::collectExpandableDatas(FREENEWS_RSS); parent::collectExpandableDatas('http://feeds.feedburner.com/Freenews-Freebox?format=xml');
} }
protected function parseRSSItem($newsItem) { protected function parseItem($newsItem) {
$item = array(); $item = $this->parseRSS_2_0_Item($newsItem);
$item['title'] = trim($newsItem->title);
$this->debugMessage("item has for title \"".$item['title']."\"");
if(empty($newsItem->guid)) {
$item['uri'] = (string) $newsItem->link;
} else {
$item['uri'] = (string) $newsItem->guid;
}
// now load that uri from cache
$this->debugMessage("now loading page ".$item['uri']);
$articlePage = $this->get_cached($item['uri']); $articlePage = $this->get_cached($item['uri']);
$content = $articlePage->find('.post-container', 0); $content = $articlePage->find('.post-container', 0);
$item['content'] = $content->innertext; $item['content'] = $content->innertext;
$item['author'] = $articlePage->find('a[rel=author]', 0)->innertext;
// format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple
$item['timestamp'] = $this->RSS_2_0_time_to_timestamp($newsItem);
return $item; return $item;
} }
} }

View file

@ -1,35 +1,19 @@
<?php <?php
class Les400CulsBridge extends RssExpander{ class Les400CulsBridge extends FeedExpander{
const MAINTAINER = "unknown"; const MAINTAINER = "unknown";
const NAME = "Les 400 Culs"; const NAME = "Les 400 Culs";
const URI = "http://sexes.blogs.liberation.fr/"; const URI = "http://sexes.blogs.liberation.fr/";
const DESCRIPTION = "La planete sexe vue par Agnes Girard via rss-bridge"; const DESCRIPTION = "La planete sexe vue par Agnes Girard via rss-bridge";
public function collectData(){ public function collectData(){
$this->collectExpandableDatas(self::URI . 'feeds/'); $this->collectExpandableDatas(self::URI . 'feeds/');
} }
protected function parseRSSItem($newsItem) { protected function parseItem($newsItem){
$item = array(); return $this->parseRSS_2_0_Item($newsItem);
$item['title'] = trim((string) $newsItem->title);
$this->debugMessage("browsing item ".var_export($newsItem, true));
if(empty($newsItem->guid)) {
$item['uri'] = (string) $newsItem->link;
} else {
$item['uri'] = (string) $newsItem->guid;
} }
// now load that uri from cache
$this->debugMessage("now loading page ".$item['uri']);
// $articlePage = $this->get_cached($item['uri']);
// $content = $articlePage->find('.post-container', 0);
$item['content'] = (string) $newsItem->description;
$item['author'] = (string) $newsItem->author;
$item['timestamp'] = $this->RSS_2_0_time_to_timestamp($newsItem);
return $item;
}
public function getCacheDuration(){ public function getCacheDuration(){
return 7200; // 2h hours return 7200; // 2h hours
} }

View file

@ -1,5 +1,5 @@
<?php <?php
class TheOatmealBridge extends RssExpander{ class TheOatmealBridge extends FeedExpander{
const MAINTAINER = "Riduidel"; const MAINTAINER = "Riduidel";
const NAME = "The Oatmeal"; const NAME = "The Oatmeal";
@ -10,44 +10,17 @@ class TheOatmealBridge extends RssExpander{
$this->collectExpandableDatas('http://feeds.feedburner.com/oatmealfeed'); $this->collectExpandableDatas('http://feeds.feedburner.com/oatmealfeed');
} }
protected function parseItem($newsItem) {
$item = $this->parseRSS_1_0_Item($newsItem);
/**
* Since the oatmeal produces a weird RSS feed, I have to fix it by loading the items separatly from the feed infos
*/
protected function collect_RSS_2_0_data($rssContent) {
$rssContent->registerXPathNamespace("dc", "http://purl.org/dc/elements/1.1/");
$rssHeaderContent = $rssContent->channel[0];
$this->debugMessage("RSS content is ===========\n".var_export($rssHeaderContent, true)."===========");
$this->load_RSS_2_0_feed_data($rssHeaderContent);
foreach($rssContent->item as $item) {
$this->debugMessage("parsing item ".var_export($item, true));
$this->items[] = $this->parseRSSItem($item);
}
}
protected function parseRSSItem($newsItem) {
$namespaces = $newsItem->getNameSpaces(true);
$dc = $newsItem->children($namespaces['dc']);
$rdf = $newsItem->children($namespaces['rdf']);
$item = array();
$item['title'] = trim($newsItem->title);
$this->debugMessage("browsing Oatmeal item ".var_export($newsItem, true));
$item['uri']=(string) $newsItem->attributes($namespaces['rdf'])->about;
// now load that uri from cache
$this->debugMessage("now loading page ".$item['uri']);
$articlePage = $this->get_cached($item['uri']); $articlePage = $this->get_cached($item['uri']);
$content = $articlePage->find('#comic', 0); $content = $articlePage->find('#comic', 0);
if($content==null) { if(is_null($content)) // load alternative
$content = $articlePage->find('#blog'); $content = $articlePage->find('#blog', 0);
}
if(!is_null($content))
$item['content'] = $content->innertext; $item['content'] = $content->innertext;
$this->debugMessage("dc content is ".var_export($dc, true));
$item['author'] = (string) $dc->creator;
$item['timestamp'] = DateTime::createFromFormat(DateTime::ISO8601, $dc->date)->getTimestamp();
$this->debugMessage("writtem by ".$item['author']." on ".$item['timestamp']);
return $item; return $item;
} }

View file

@ -585,30 +585,52 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
} }
} }
abstract class RssExpander extends HttpCachingBridgeAbstract { abstract class FeedExpander extends HttpCachingBridgeAbstract {
private $name; private $name;
private $uri; private $uri;
private $description; private $description;
public function collectExpandableDatas($name){ public function collectExpandableDatas($url){
if(empty($name)){ if(empty($url)){
$this->returnServerError('There is no $name for this RSS expander'); $this->returnServerError('There is no $url for this RSS expander');
} }
$this->debugMessage('Loading from ' . $name); $this->debugMessage('Loading from ' . $url);
/* Notice we do not use cache here on purpose: /* Notice we do not use cache here on purpose:
* we want a fresh view of the RSS stream each time * we want a fresh view of the RSS stream each time
*/ */
$content = $this->getContents($name) or $this->returnServerError('Could not request ' . $name); $content = $this->getContents($url)
or $this->returnServerError('Could not request ' . $url);
$rssContent = simplexml_load_string($content); $rssContent = simplexml_load_string($content);
$this->debugMessage('loaded RSS from ' . $name);
// TODO insert RSS format detection $this->debugMessage('Detecting feed format/version');
// For now we always assume RSS 2.0 if(isset($rssContent->channel[0])){
$this->debugMessage('Detected RSS format');
if(isset($rssContent->item[0])){
$this->debugMessage('Detected RSS 1.0 format');
$this->collect_RSS_1_0_data($rssContent);
} else {
$this->debugMessage('Detected RSS 0.9x or 2.0 format');
$this->collect_RSS_2_0_data($rssContent); $this->collect_RSS_2_0_data($rssContent);
} }
} elseif(isset($rssContent->entry[0])){
$this->debugMessage('Detected ATOM format');
$this->collect_ATOM_data($rssContent);
} else {
$this->debugMessage('Unknown feed format/version');
$this->returnServerError('The feed format is unknown!');
}
}
/**
 * Collects the data of an RSS 1.0 feed.
 *
 * The feed information is loaded from the first channel element, while the
 * items are read directly from the document root (not from inside the
 * channel). Each item is passed to parseItem() and the result is appended
 * to the list of items.
 *
 * @param $rssContent the parsed feed document
 */
protected function collect_RSS_1_0_data($rssContent){
    $channel = $rssContent->channel[0];
    $this->load_RSS_2_0_feed_data($channel);

    foreach($rssContent->item as $feedItem){
        $dump = var_export($feedItem, true);
        $this->debugMessage('parsing item ' . $dump);

        $parsed = $this->parseItem($feedItem);
        $this->items[] = $parsed;
    }
}
protected function collect_RSS_2_0_data($rssContent){ protected function collect_RSS_2_0_data($rssContent){
$rssContent = $rssContent->channel[0]; $rssContent = $rssContent->channel[0];
@ -616,7 +638,15 @@ abstract class RssExpander extends HttpCachingBridgeAbstract {
$this->load_RSS_2_0_feed_data($rssContent); $this->load_RSS_2_0_feed_data($rssContent);
foreach($rssContent->item as $item){ foreach($rssContent->item as $item){
$this->debugMessage('parsing item ' . var_export($item, true)); $this->debugMessage('parsing item ' . var_export($item, true));
$this->items[] = $this->parseRSSItem($item); $this->items[] = $this->parseItem($item);
}
}
protected function collect_ATOM_data($content){
$this->load_ATOM_feed_data($content);
foreach($content->entry as $item){
$this->debugMessage('parsing item ' . var_export($item, true));
$this->items[] = $this->parseItem($item);
} }
} }
@ -631,12 +661,88 @@ abstract class RssExpander extends HttpCachingBridgeAbstract {
$this->description = trim($rssContent->description); $this->description = trim($rssContent->description);
} }
/**
 * Loads the feed level information (name, URI and description) of an ATOM feed.
 *
 * All values are stored as plain strings. Without the casts the properties
 * would hold SimpleXMLElement objects, which cannot be serialized and which
 * differ from the strings stored for RSS feeds (those go through trim()).
 *
 * URI selection:
 * - no link element: empty string
 * - exactly one link element: its href
 * - several link elements: href of the first 'alternate' link; a link
 *   without a rel attribute counts as 'alternate', which is the default
 *   relation defined by RFC 4287. Empty string if there is none.
 *
 * The description is only set when the feed has a subtitle element.
 *
 * @param $content the parsed ATOM feed document (root element)
 */
protected function load_ATOM_feed_data($content){
    $this->name = (string)$content->title;

    $this->uri = '';
    $linkCount = isset($content->link) ? count($content->link) : 0;

    if($linkCount === 1){
        $this->uri = (string)$content->link[0]['href'];
    } elseif($linkCount > 1){
        foreach($content->link as $link){
            // A missing rel attribute means 'alternate' (RFC 4287, 4.2.7.2)
            $rel = isset($link['rel']) ? strtolower((string)$link['rel']) : 'alternate';

            if($rel === 'alternate'){
                $this->uri = (string)$link['href'];
                break;
            }
        }
    }

    if(isset($content->subtitle))
        $this->description = (string)$content->subtitle;
}
/**
 * Builds an item array from one ATOM entry.
 *
 * Only elements present in the entry produce a key in the result:
 * id => 'uri', title => 'title', updated => 'timestamp' (via strtotime),
 * author/name => 'author', content => 'content'.
 *
 * Every value is cast to a string before use, so the returned array holds
 * plain scalars instead of SimpleXMLElement objects (which cannot be
 * serialized and fail strict string comparisons).
 *
 * @param $feedItem the ATOM entry element
 * @return array the item data found in the entry
 */
protected function parseATOMItem($feedItem){
    $item = array();

    if(isset($feedItem->id)) $item['uri'] = (string)$feedItem->id;
    if(isset($feedItem->title)) $item['title'] = (string)$feedItem->title;
    if(isset($feedItem->updated)) $item['timestamp'] = strtotime((string)$feedItem->updated);
    // When several authors exist, the name of the first one is used
    if(isset($feedItem->author)) $item['author'] = (string)$feedItem->author->name;
    if(isset($feedItem->content)) $item['content'] = (string)$feedItem->content;

    return $item;
}
/**
 * Builds an item array from one RSS 0.91 item.
 *
 * Only elements present in the item produce a key in the result:
 * link => 'uri', title => 'title', description => 'content'.
 * RSS 0.91 supports neither timestamps nor authors, so those keys are
 * never set here.
 *
 * Every value is cast to a string, so the returned array holds plain
 * scalars instead of SimpleXMLElement objects (which cannot be serialized
 * and fail strict string comparisons).
 *
 * @param $feedItem the RSS item element
 * @return array the item data found in the item
 */
protected function parseRSS_0_9_1_Item($feedItem){
    $item = array();

    if(isset($feedItem->link)) $item['uri'] = (string)$feedItem->link;
    if(isset($feedItem->title)) $item['title'] = (string)$feedItem->title;
    if(isset($feedItem->description)) $item['content'] = (string)$feedItem->description;

    return $item;
}
/**
 * Builds an item array from one RSS 1.0 item.
 *
 * RSS 1.0 adds optional elements around the 0.91 standard, so the basic
 * fields come from parseRSS_0_9_1_Item(). When the document declares a
 * 'dc' namespace prefix, dc:date becomes 'timestamp' (via strtotime) and
 * dc:creator becomes 'author'.
 *
 * The values added here are cast to strings first, so 'author' is a plain
 * string rather than a SimpleXMLElement object.
 *
 * @param $feedItem the RSS item element
 * @return array the item data found in the item
 */
protected function parseRSS_1_0_Item($feedItem){
    $item = $this->parseRSS_0_9_1_Item($feedItem);

    $namespaces = $feedItem->getNamespaces(true);
    if(isset($namespaces['dc'])){
        $dc = $feedItem->children($namespaces['dc']);

        if(isset($dc->date)) $item['timestamp'] = strtotime((string)$dc->date);
        if(isset($dc->creator)) $item['author'] = (string)$dc->creator;
    }

    return $item;
}
/**
 * Builds an item array from one RSS 2.0 item.
 *
 * The basic fields are compatible with RSS 0.91 and come from
 * parseRSS_0_9_1_Item(). On top of that:
 * - 'timestamp' is taken from pubDate, falling back to dc:date
 * - 'author' is taken from author, falling back to dc:creator
 * The dc fallbacks only apply when the document declares a 'dc' namespace
 * prefix.
 *
 * The values added here are cast to strings first, so 'author' is a plain
 * string rather than a SimpleXMLElement object.
 *
 * @param $feedItem the RSS item element
 * @return array the item data found in the item
 */
protected function parseRSS_2_0_Item($feedItem){
    $item = $this->parseRSS_0_9_1_Item($feedItem);

    // Always defined, so the fallbacks below never read an unset variable
    $dc = null;
    $namespaces = $feedItem->getNamespaces(true);
    if(isset($namespaces['dc'])) $dc = $feedItem->children($namespaces['dc']);

    if(isset($feedItem->pubDate)){
        $item['timestamp'] = strtotime((string)$feedItem->pubDate);
    } elseif(isset($dc->date)){
        $item['timestamp'] = strtotime((string)$dc->date);
    }

    if(isset($feedItem->author)){
        $item['author'] = (string)$feedItem->author;
    } elseif(isset($dc->creator)){
        $item['author'] = (string)$dc->creator;
    }

    return $item;
}
/** /**
* Method should return, from a source RSS item given by lastRSS, one of our Items objects * Method should return, from a source RSS item given by lastRSS, one of our Items objects
* @param $item the input rss item * @param $item the input rss item
* @return a RSS-Bridge Item, with (hopefully) the whole content) * @return a RSS-Bridge Item, with (hopefully) the whole content)
*/ */
abstract protected function parseRSSItem($item); abstract protected function parseItem($item);
public function getURI(){ public function getURI(){
return $this->uri; return $this->uri;