[bridges] make them FeedExpander
Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
This commit is contained in:
parent
244516f0a2
commit
8f58c9f86b
7 changed files with 190 additions and 306 deletions
|
@ -1,62 +1,26 @@
|
|||
<?php
|
||||
#ini_set('display_errors', 'On');
|
||||
#error_reporting(E_ALL);
|
||||
class ArstechnicaBridge extends BridgeAbstract {
|
||||
class ArstechnicaBridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = "prysme";
|
||||
const NAME = "ArstechnicaBridge";
|
||||
const URI = "http://arstechnica.com";
|
||||
const DESCRIPTION = "The PC enthusiast's resource. Power users and the tools they love, without computing religion";
|
||||
|
||||
function StripWithDelimiters($string, $start, $end) {
|
||||
while (strpos($string, $start) !== false) {
|
||||
$section_to_remove = substr($string, strpos($string, $start));
|
||||
$section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));
|
||||
$string = str_replace($section_to_remove, '', $string);
|
||||
} return $string;
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
$html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$html){
|
||||
$item['content'] .= '<p>Requesting full article failed.</p>';
|
||||
}else{
|
||||
$item['content'] = $html->find('.article-guts', 0);
|
||||
}
|
||||
|
||||
function StripCDATA($string) {
|
||||
$string = str_replace('<![CDATA[', '', $string);
|
||||
$string = str_replace(']]>', '', $string);
|
||||
return $string;
|
||||
}
|
||||
|
||||
function ExtractContent($url) {
|
||||
#echo $url;
|
||||
$html2 = $this->getSimpleHTMLDOM($url);
|
||||
|
||||
$text = $html2->find("section[id='article-guts']", 0);
|
||||
/*foreach ($text->find('<aside id="social-left">') as $node)
|
||||
{ $node = NULL; }*/
|
||||
$text = $this->StripWithDelimiters($text->innertext,'<aside id="social-left">','</aside>');
|
||||
$text = $this->StripWithDelimiters($text,'<figcaption class="caption">','</figcaption>');
|
||||
$text = $this->StripWithDelimiters($text,'<div class="gallery shortcode-gallery">','</div>');
|
||||
//error_log("ICI", 0);
|
||||
//error_log($text, 0);
|
||||
|
||||
return $text;
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
|
||||
$html = $this->getSimpleHTMLDOM('http://feeds.arstechnica.com/arstechnica/index') or $this->returnServerError('Could not request NextInpact.');
|
||||
$limit = 0;
|
||||
|
||||
foreach($html->find('item') as $element) {
|
||||
if($limit < 5) {
|
||||
$item = array();
|
||||
$item['title'] = $this->StripCDATA($element->find('title', 0)->innertext);
|
||||
$item['uri'] = $this->StripCDATA($element->find('guid', 0)->plaintext);
|
||||
$item['author'] = $this->StripCDATA($element->find('author', 0)->innertext);
|
||||
$item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
|
||||
$item['content'] = $this->ExtractContent($item['uri']);
|
||||
//$item['content'] = $item['uri'];
|
||||
$this->items[] = $item;
|
||||
$limit++;
|
||||
}
|
||||
}
|
||||
|
||||
$this->collectExpandableDatas('http://feeds.arstechnica.com/arstechnica/index/');
|
||||
}
|
||||
|
||||
public function getCacheDuration() {
|
||||
|
|
|
@ -1,38 +1,22 @@
|
|||
<?php
|
||||
class KoreusBridge extends BridgeAbstract{
|
||||
class KoreusBridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = "pit-fgfjiudghdf";
|
||||
const NAME = "Koreus";
|
||||
const URI = "http://www.koreus.com/";
|
||||
const DESCRIPTION = "Returns the 5 newest posts from Koreus (full text)";
|
||||
const DESCRIPTION = "Returns the newest posts from Koreus (full text)";
|
||||
|
||||
private function KoreusStripCDATA($string) {
|
||||
$string = str_replace('<![CDATA[', '', $string);
|
||||
$string = str_replace(']]>', '', $string);
|
||||
return $string;
|
||||
}
|
||||
protected function parseItem($item) {
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
private function KoreusExtractContent($url) {
|
||||
$html2 = $this->getSimpleHTMLDOM($url);
|
||||
$text = $html2->find('p[class=itemText]', 0)->innertext;
|
||||
$text = utf8_encode(preg_replace('/(Sur le m.+?)+$/i','',$text));
|
||||
return $text;
|
||||
$html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
$text = $html->find('p.itemText', 0)->innertext;
|
||||
$item['content'] = utf8_encode($text);
|
||||
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
$html = $this->getSimpleHTMLDOM('http://feeds.feedburner.com/Koreus-articles') or $this->returnServerError('Could not request Koreus.');
|
||||
$limit = 0;
|
||||
|
||||
foreach($html->find('item') as $element) {
|
||||
if($limit < 5) {
|
||||
$item = array();
|
||||
$item['title'] = $this->KoreusStripCDATA($element->find('title', 0)->innertext);
|
||||
$item['uri'] = $this->KoreusStripCDATA($element->find('guid', 0)->plaintext);
|
||||
$item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
|
||||
$item['content'] = $this->KoreusExtractContent($item['uri']);
|
||||
$this->items[] = $item;
|
||||
$limit++;
|
||||
}
|
||||
}
|
||||
$this->collectExpandableDatas('http://feeds.feedburner.com/Koreus-articles');
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
<?php
|
||||
class NakedSecurityBridge extends BridgeAbstract {
|
||||
class NakedSecurityBridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = 'ORelio';
|
||||
const NAME = 'Naked Security';
|
||||
const URI = 'https://nakedsecurity.sophos.com/';
|
||||
const DESCRIPTION = 'Returns the newest articles.';
|
||||
|
||||
public function collectData(){
|
||||
|
||||
function StripRecursiveHTMLSection($string, $tag_name, $tag_start) {
|
||||
private function StripRecursiveHTMLSection($string, $tag_name, $tag_start) {
|
||||
$open_tag = '<'.$tag_name;
|
||||
$close_tag = '</'.$tag_name.'>';
|
||||
$close_tag_length = strlen($close_tag);
|
||||
|
@ -32,37 +30,32 @@ class NakedSecurityBridge extends BridgeAbstract {
|
|||
return $string;
|
||||
}
|
||||
|
||||
$feedUrl = 'https://feeds.feedburner.com/nakedsecurity?format=xml';
|
||||
$html = $this->getSimpleHTMLDOM($feedUrl) or $this->returnServerError('Could not request '.$this->getName().': '.$feedUrl);
|
||||
$limit = 0;
|
||||
|
||||
foreach ($html->find('item') as $element) {
|
||||
if ($limit < 10) {
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
//Retrieve article Uri and get that page
|
||||
$article_uri = $element->find('guid', 0)->plaintext;
|
||||
$article_html = $this->getSimpleHTMLDOM($article_uri) or $this->returnServerError('Could not request '.$this->getName().': '.$article_uri);
|
||||
$article_html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$article_html){
|
||||
$item['content'] = 'Could not request '.$this->getName().': '.$item['uri'];
|
||||
return $item;
|
||||
}
|
||||
|
||||
//Build article contents from corresponding elements
|
||||
$article_title = trim($element->find('title', 0)->plaintext);
|
||||
$article_image = $article_html->find('img.wp-post-image', 0)->src;
|
||||
$article_summary = strip_tags(html_entity_decode($element->find('description', 0)->plaintext));
|
||||
$article_content = $article_html->find('div.entry-content', 0)->innertext;
|
||||
$article_content = StripRecursiveHTMLSection($article_content , 'div', '<div class="entry-prefix"');
|
||||
$article_content = StripRecursiveHTMLSection($article_content , 'script', '<script');
|
||||
$article_content = StripRecursiveHTMLSection($article_content , 'aside', '<aside');
|
||||
$article_content = '<p><img src="'.$article_image.'" /></p><p><b>'.$article_summary.'</b></p>'.$article_content;
|
||||
$article_content = $this->StripRecursiveHTMLSection($article_content , 'div', '<div class="entry-prefix"');
|
||||
$article_content = $this->StripRecursiveHTMLSection($article_content , 'script', '<script');
|
||||
$article_content = $this->StripRecursiveHTMLSection($article_content , 'aside', '<aside');
|
||||
$article_content = '<p><img src="'.$article_image.'" /></p><p><b>'.$item['content'].'</b></p>'.$article_content;
|
||||
|
||||
//Build and add final item
|
||||
$item = array();
|
||||
$item['uri'] = $article_uri;
|
||||
$item['title'] = $article_title;
|
||||
$item['author'] = $article_html->find('a[rel=author]', 0)->plaintext;
|
||||
$item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
|
||||
$item['content'] = $article_content;
|
||||
$this->items[] = $item;
|
||||
$limit++;
|
||||
}
|
||||
}
|
||||
|
||||
return $item;
|
||||
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
|
||||
$feedUrl = 'https://feeds.feedburner.com/nakedsecurity?format=xml';
|
||||
$this->collectExpandableDatas($feedUrl);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
<?php
|
||||
class NeuviemeArtBridge extends BridgeAbstract {
|
||||
class NeuviemeArtBridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = "ORelio";
|
||||
const NAME = '9ème Art Bridge';
|
||||
const URI = "http://www.9emeart.fr/";
|
||||
const DESCRIPTION = "Returns the newest articles.";
|
||||
|
||||
public function collectData(){
|
||||
|
||||
function StripWithDelimiters($string, $start, $end) {
|
||||
private function StripWithDelimiters($string, $start, $end) {
|
||||
while (strpos($string, $start) !== false) {
|
||||
$section_to_remove = substr($string, strpos($string, $start));
|
||||
$section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));
|
||||
|
@ -16,39 +14,42 @@ class NeuviemeArtBridge extends BridgeAbstract {
|
|||
} return $string;
|
||||
}
|
||||
|
||||
$feedUrl = self::URI.'9emeart.rss';
|
||||
$html = $this->getSimpleHTMLDOM($feedUrl) or $this->returnServerError('Could not request 9eme Art: '.$feedUrl);
|
||||
$limit = 0;
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
foreach ($html->find('item') as $element) {
|
||||
if ($limit < 5) {
|
||||
$article_html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$article_html){
|
||||
$item['content'] = 'Could not request 9eme Art: '.$item['uri'];
|
||||
return $item;
|
||||
}
|
||||
|
||||
//Retrieve article Uri and get that page
|
||||
$article_uri = $element->find('guid', 0)->plaintext;
|
||||
$article_html = $this->getSimpleHTMLDOM($article_uri) or $this->returnServerError('Could not request 9eme Art: '.$article_uri);
|
||||
|
||||
//Build article contents from corresponding elements
|
||||
$article_title = trim($element->find('title', 0)->plaintext);
|
||||
$article_image = $element->find('enclosure', 0)->url;
|
||||
foreach ($article_html->find('img.img_full') as $img)
|
||||
if ($img->alt == $article_title)
|
||||
$article_image = '';
|
||||
foreach ($article_html->find('img.img_full') as $img){
|
||||
if ($img->alt == $item['title']){
|
||||
$article_image = self::URI.$img->src;
|
||||
$article_content = '<p><img src="'.$article_image.'" /></p>'
|
||||
.str_replace('src="/', 'src="'.self::URI, $article_html->find('div.newsGenerique_con', 0)->innertext);
|
||||
$article_content = StripWithDelimiters($article_content, '<script', '</script>');
|
||||
$article_content = StripWithDelimiters($article_content, '<style', '</style>');
|
||||
$article_content = StripWithDelimiters($article_content, '<link', '>');
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
$article_content='';
|
||||
if($article_image){
|
||||
$article_content = '<p><img src="'.$article_image.'" /></p>';
|
||||
}
|
||||
$article_content .= str_replace(
|
||||
'src="/', 'src="'.self::URI,
|
||||
$article_html->find('div.newsGenerique_con', 0)->innertext
|
||||
);
|
||||
$article_content = $this->StripWithDelimiters($article_content, '<script', '</script>');
|
||||
$article_content = $this->StripWithDelimiters($article_content, '<style', '</style>');
|
||||
$article_content = $this->StripWithDelimiters($article_content, '<link', '>');
|
||||
|
||||
//Build and add final item
|
||||
$item = array();
|
||||
$item['uri'] = $article_uri;
|
||||
$item['title'] = $article_title;
|
||||
$item['author'] = $article_html->find('a[class=upp transition_fast upp]', 0)->plaintext;
|
||||
$item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
|
||||
$item['content'] = $article_content;
|
||||
$this->items[] = $item;
|
||||
$limit++;
|
||||
}
|
||||
}
|
||||
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
$feedUrl = self::URI.'9emeart.rss';
|
||||
$this->collectExpandableDatas($feedUrl);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,42 +1,22 @@
|
|||
<?php
|
||||
class Rue89Bridge extends BridgeAbstract{
|
||||
class Rue89Bridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = "pit-fgfjiudghdf";
|
||||
const NAME = "Rue89";
|
||||
const URI = "http://rue89.nouvelobs.com/";
|
||||
const DESCRIPTION = "Returns the 5 newest posts from Rue89 (full text)";
|
||||
|
||||
private function rue89getDatas($url){
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
$url = "http://api.rue89.nouvelobs.com/export/mobile2/node/" . str_replace(" ", "", substr($url, -8)) . "/full";
|
||||
$url = "http://api.rue89.nouvelobs.com/export/mobile2/node/" . str_replace(" ", "", substr($item['uri'], -8)) . "/full";
|
||||
$datas = json_decode($this->getContents($url), true);
|
||||
$item['content'] = $datas['node']['body'];
|
||||
|
||||
return $datas["node"];
|
||||
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
|
||||
$html = $this->getSimpleHTMLDOM('http://api.rue89.nouvelobs.com/feed') or $this->returnServerError('Could not request Rue89.');
|
||||
|
||||
$limit = 0;
|
||||
foreach($html->find('item') as $element) {
|
||||
|
||||
if($limit < 5) {
|
||||
|
||||
$datas = $this->rue89getDatas(str_replace('#commentaires', '', ($element->find('comments', 0)->plaintext)));
|
||||
|
||||
$item = array();
|
||||
$item['title'] = $datas["title"];
|
||||
$item['author'] = $datas["author"][0]["name"];
|
||||
$item['timestamp'] = $datas["updated"];
|
||||
$item['content'] = $datas["body"];
|
||||
$item['uri'] = $datas["url"];
|
||||
|
||||
$this->items[] = $item;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
$this->collectExpandableDatas('http://api.rue89.nouvelobs.com/feed');
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,38 +1,21 @@
|
|||
<?php
|
||||
class SiliconBridge extends BridgeAbstract {
|
||||
class SiliconBridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = "ORelio";
|
||||
const NAME = 'Silicon Bridge';
|
||||
const URI = 'http://www.silicon.fr/';
|
||||
const DESCRIPTION = "Returns the newest articles.";
|
||||
|
||||
public function collectData(){
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
function StripCDATA($string) {
|
||||
$string = str_replace('<![CDATA[', '', $string);
|
||||
$string = str_replace(']]>', '', $string);
|
||||
return $string;
|
||||
$article_html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$article_html){
|
||||
$item['content'] .= '<p>Could not request Silicon: '.$item['uri'].'</p>';
|
||||
return $item;
|
||||
}
|
||||
|
||||
$feedUrl = self::URI.'feed';
|
||||
$html = $this->getSimpleHTMLDOM($feedUrl)
|
||||
or $this->returnServerError('Could not request Silicon: '.$feedUrl);
|
||||
$limit = 0;
|
||||
|
||||
foreach($html->find('item') as $element) {
|
||||
if($limit < 5) {
|
||||
|
||||
//Retrieve article Uri and get that page
|
||||
$article_uri = $element->innertext;
|
||||
$article_uri = substr($article_uri, strpos($article_uri, '<link>') + 6);
|
||||
$article_uri = substr($article_uri, 0, strpos($article_uri, '</link>'));
|
||||
$article_html = $this->getSimpleHTMLDOM($article_uri)
|
||||
or $this->returnServerError('Could not request Silicon: '.$article_uri);
|
||||
|
||||
//Build article contents from corresponding elements
|
||||
$thumbnailUri = $element->find('enclosure', 0)->url;
|
||||
$article_content = '<p><img src="'.$thumbnailUri.'" /></p>'
|
||||
.'<p><b>'.$article_html->find('div.entry-excerpt', 0)->plaintext.'</b></p>'
|
||||
$article_content = '<p><b>'.$article_html->find('div.entry-excerpt', 0)->plaintext.'</b></p>'
|
||||
.$article_html->find('div.entry-content', 0)->innertext;
|
||||
|
||||
//Remove useless scripts left in the page
|
||||
|
@ -42,17 +25,14 @@ class SiliconBridge extends BridgeAbstract {
|
|||
$article_content = str_replace($script_section, '', $article_content);
|
||||
}
|
||||
|
||||
//Build and add final item
|
||||
$item = array();
|
||||
$item['uri'] = $article_uri;
|
||||
$item['title'] = StripCDATA($element->find('title', 0)->innertext);
|
||||
$item['author'] = StripCDATA($element->find('dc:creator', 0)->innertext);
|
||||
$item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
|
||||
$item['content'] = $article_content;
|
||||
$this->items[] = $item;
|
||||
$limit++;
|
||||
}
|
||||
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
$feedUrl = self::URI.'feed';
|
||||
$this->collectExpandableDatas($feedUrl);
|
||||
}
|
||||
|
||||
public function getCacheDuration() {
|
||||
|
|
|
@ -1,22 +1,12 @@
|
|||
<?php
|
||||
class WeLiveSecurityBridge extends BridgeAbstract {
|
||||
class WeLiveSecurityBridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = 'ORelio';
|
||||
const NAME = 'We Live Security';
|
||||
const URI = 'http://www.welivesecurity.com/';
|
||||
const DESCRIPTION = 'Returns the newest articles.';
|
||||
|
||||
public function collectData(){
|
||||
|
||||
function ExtractFromDelimiters($string, $start, $end) {
|
||||
if (strpos($string, $start) !== false) {
|
||||
$section_retrieved = substr($string, strpos($string, $start) + strlen($start));
|
||||
$section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end));
|
||||
return $section_retrieved;
|
||||
} return false;
|
||||
}
|
||||
|
||||
function StripWithDelimiters($string, $start, $end) {
|
||||
private function StripWithDelimiters($string, $start, $end) {
|
||||
while (strpos($string, $start) !== false) {
|
||||
$section_to_remove = substr($string, strpos($string, $start));
|
||||
$section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));
|
||||
|
@ -24,36 +14,28 @@ class WeLiveSecurityBridge extends BridgeAbstract {
|
|||
} return $string;
|
||||
}
|
||||
|
||||
$feed = $this->getURI().'feed/';
|
||||
$html = $this->getSimpleHTMLDOM($feed) or $this->returnServerError('Could not request '.$this->getName().': '.$feed);
|
||||
$limit = 0;
|
||||
|
||||
foreach ($html->find('item') as $element) {
|
||||
if ($limit < 5) {
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
$article_html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$article_html){
|
||||
$item['content'] .= '<p>Could not request '.$this->getName().': '.$item['uri'].'</p>';
|
||||
return $item;
|
||||
}
|
||||
|
||||
$article_image = $element->find('image', 0)->plaintext;
|
||||
$article_url = ExtractFromDelimiters($element->innertext, '<link>', '</link>');
|
||||
$article_summary = ExtractFromDelimiters($element->innertext, '<description><![CDATA[<p>', '</p>');
|
||||
$article_html = $this->getContents($article_url) or $this->returnServerError('Could not request '.$this->getName().': '.$article_url);
|
||||
if (substr($article_html, 0, 2) == "\x1f\x8b") //http://www.gzip.org/zlib/rfc-gzip.html#header-trailer -> GZip ID1
|
||||
$article_html = gzdecode($article_html); //Response is GZipped even if we didn't accept GZip!? Let's decompress...
|
||||
$article_html = str_get_html($article_html); //Now we have our HTML data. But still, that's an important HTTP violation...
|
||||
$article_content = $article_html->find('div.wlistingsingletext', 0)->innertext;
|
||||
$article_content = StripWithDelimiters($article_content, '<script', '</script>');
|
||||
$article_content = '<p><img src="'.$article_image.'" /></p>'
|
||||
.'<p><b>'.$article_summary.'</b></p>'
|
||||
$article_content = $this->StripWithDelimiters($article_content, '<script', '</script>');
|
||||
$article_content = '<p><b>'.$item['content'].'</b></p>'
|
||||
.trim($article_content);
|
||||
|
||||
$item = array();
|
||||
$item['uri'] = $article_url;
|
||||
$item['title'] = $element->find('title', 0)->plaintext;
|
||||
$item['author'] = $article_html->find('a[rel=author]', 0)->plaintext;
|
||||
$item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
|
||||
$item['content'] = $article_content;
|
||||
$this->items[] = $item;
|
||||
$limit++;
|
||||
|
||||
}
|
||||
}
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
$feed = static::URI.'feed/';
|
||||
$this->collectExpandableDatas($feed);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue