[bridges] make them WordPressBridge derivatives

The site-specific content filtering previously done in these bridges
will need to be reintegrated later, either in each bridge or in
WordPressBridge itself when a filter is generic enough (like the
already existing WordPressBridge <script> removal filter).

Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
This commit is contained in:
Pierre Mazière 2016-09-15 12:40:26 +02:00
parent 43ac961284
commit 3f64d2d65a
7 changed files with 32 additions and 209 deletions

View file

@ -1,30 +1,11 @@
<?php
class ArstechnicaBridge extends FeedExpander {
require_once('WordPressBridge.php');
class ArstechnicaBridge extends WordPressBridge {
const MAINTAINER = "prysme";
const NAME = "ArstechnicaBridge";
const URI = "http://arstechnica.com";
const DESCRIPTION = "The PC enthusiast's resource. Power users and the tools they love, without computing religion";
/**
 * Parses a feed entry and swaps its content for the full article body.
 *
 * The entry is first parsed by parent::parseItem(), then the page behind
 * the item's 'uri' is fetched and its '.article-guts' node becomes the
 * content.
 *
 * @param mixed $item Feed entry, forwarded untouched to parent::parseItem().
 * @return mixed The parsed item with its 'content' key updated.
 */
protected function parseItem($item){
    $item = parent::parseItem($item);
    $html = $this->getSimpleHTMLDOMCached($item['uri']);
    if(!$html){
        $item['content'] .= '<p>Requesting full article failed.</p>';
        return $item;
    }
    // find() with an index gives back nothing when the selector does not
    // match; keep the feed-provided content instead of replacing it with null.
    $article = $html->find('.article-guts', 0);
    if($article){
        $item['content'] = $article;
    }
    return $item;
}
/**
 * Hands the Ars Technica index feed URL to collectExpandableDatas().
 */
public function collectData(){
    $feedUrl = 'http://feeds.arstechnica.com/arstechnica/index/';
    $this->collectExpandableDatas($feedUrl);
}
/**
 * Cache duration for this bridge: two hours, expressed in seconds.
 *
 * @return int 7200
 */
public function getCacheDuration() {
    $twoHours = 2 * 60 * 60;
    return $twoHours;
}
const PARAMETERS = array();
}

View file

@ -1,22 +1,11 @@
<?php
class FreenewsBridge extends FeedExpander {
require_once('WordPressBridge.php');
class FreenewsBridge extends WordPressBridge {
const MAINTAINER = "mitsukarenai";
const NAME = "Freenews";
const URI = "http://freenews.fr";
const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales.";
/**
 * Hands the Freenews feedburner URL to the parent's collectExpandableDatas().
 */
public function collectData(){
    $feedUrl = 'http://feeds.feedburner.com/Freenews-Freebox?format=xml';
    parent::collectExpandableDatas($feedUrl);
}
/**
 * Parses a feed entry and replaces its content with the article's
 * '.post-container' markup.
 *
 * @param mixed $newsItem Feed entry, forwarded untouched to parent::parseItem().
 * @return mixed The parsed item; 'content' is only overwritten when the
 *               article page and its container could both be obtained.
 */
protected function parseItem($newsItem) {
    $item = parent::parseItem($newsItem);
    $articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
    // NOTE(review): assumes a failed request yields a falsy value, as the
    // sibling bridges test for — confirm against getSimpleHTMLDOMCached().
    if(!$articlePage){
        return $item;
    }
    $content = $articlePage->find('.post-container', 0);
    // No matching node: keep what the feed provided rather than
    // dereferencing a non-object.
    if($content){
        $item['content'] = $content->innertext;
    }
    return $item;
}
const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox)";
const PARAMETERS = array();
}

View file

@ -1,42 +1,13 @@
<?php
class LeJournalDuGeekBridge extends FeedExpander {
require_once('WordPressBridge.php');
class LeJournalDuGeekBridge extends WordPressBridge{
const MAINTAINER = "polopollo";
const NAME = "journaldugeek.com (FR)";
const URI = "http://www.journaldugeek.com/";
const DESCRIPTION = "Returns the 5 newest posts from LeJournalDuGeek (full text).";
/**
 * Passes the site's 'rss' URL and the value 5 to collectExpandableDatas().
 */
public function collectData(){
    $feedUrl = self::URI . 'rss';
    // presumably the maximum number of items to expand — verify against collectExpandableDatas()
    $limit = 5;
    $this->collectExpandableDatas($feedUrl, $limit);
}
/**
 * Parses a feed entry, then replaces its content with the text extracted
 * from the article page by LeJournalDuGeekExtractContent().
 *
 * @param mixed $newsItem Feed entry, forwarded untouched to parent::parseItem().
 * @return mixed The parsed item with its 'content' key replaced.
 */
protected function parseItem($newsItem){
    $parsed = parent::parseItem($newsItem);
    $fullText = $this->LeJournalDuGeekExtractContent($parsed['uri']);
    $parsed['content'] = $fullText;
    return $parsed;
}
/**
 * Builds the item content from an article page.
 *
 * Takes the markup of 'div.post-content', appends a link for the first
 * 'a.more' element labelled "Source", appends a note for every iframe whose
 * src mentions YouTube, then removes <script> blocks and every tag outside
 * the allowed list.
 *
 * @param string $url Address of the article page.
 * @return string Filtered HTML, or a short notice when the page cannot be loaded.
 */
private function LeJournalDuGeekExtractContent($url) {
    $articleHTMLContent = $this->getSimpleHTMLDOMCached($url);
    // NOTE(review): assumes a failed request yields a falsy value — confirm.
    if (!$articleHTMLContent) {
        return '<p>Could not request LeJournalDuGeek: ' . $url . '</p>';
    }

    // The post container may be missing; start from an empty text then.
    $post = $articleHTMLContent->find('div.post-content', 0);
    $text = $post ? $post->innertext : '';

    foreach($articleHTMLContent->find('a.more') as $element) {
        if ($element->innertext == "Source") {
            $text = $text . '<p><a href="' . $element->href . '">Source : ' . $element->href . '</a></p>';
            break;
        }
    }

    foreach($articleHTMLContent->find('iframe') as $element) {
        if (preg_match("/youtube/i", $element->src)) {
            $text = $text . '// An IFRAME to Youtube was included in the article: <a href="' . $element->src . '">' . $element->src . '</a><br>';
        }
    }

    // Scripts go first so their bodies do not survive strip_tags() as plain text.
    $text = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $text);
    $text = strip_tags($text, '<p><b><a><blockquote><img><em><br/><br><ul><li>');
    return $text;
}
const DESCRIPTION = "Returns the newest posts from LeJournalDuGeek (full text).";
const PARAMETERS = array();
public function getCacheDuration(){
return 1800; // 30min

View file

@ -1,61 +1,11 @@
<?php
class NakedSecurityBridge extends FeedExpander {
require_once('WordPressBridge.php');
class NakedSecurityBridge extends WordPressBridge {
const MAINTAINER = 'ORelio';
const NAME = 'Naked Security';
const URI = 'https://nakedsecurity.sophos.com/';
const DESCRIPTION = 'Returns the newest articles.';
/**
 * Removes every section that starts with $tag_start, including nested
 * elements of the same tag name, from $string.
 *
 * For each occurrence of $tag_start the end of the section is searched by
 * extending the candidate up to the next closing tag until it contains as
 * many closing tags as opening ones (at most 100 extensions).
 *
 * @param string $string    Markup to clean.
 * @param string $tag_name  Tag name without brackets, e.g. 'div'.
 * @param string $tag_start Opening fragment to look for; it must begin with
 *                          '<' followed by $tag_name, otherwise nothing is removed.
 * @return string The markup without the matched sections. When a section has
 *                no closing tag, processing stops and the remaining markup is
 *                returned as is.
 */
private function StripRecursiveHTMLSection($string, $tag_name, $tag_start) {
    $open_tag = '<'.$tag_name;
    $close_tag = '</'.$tag_name.'>';
    $close_tag_length = strlen($close_tag);

    if (strpos($tag_start, $open_tag) !== 0) {
        return $string;
    }

    while (($section_start = strpos($string, $tag_start)) !== false) {
        $max_recursion = 100;
        $section_to_remove = null;
        $search_offset = $section_start;
        do {
            $max_recursion--;
            $section_end = strpos($string, $close_tag, $search_offset);
            if ($section_end === false) {
                // No closing tag left: using false in the offset arithmetic
                // would cut an arbitrary fragment or loop forever.
                return $string;
            }
            $search_offset = $section_end + $close_tag_length;
            $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length);
            $open_tag_count = substr_count($section_to_remove, $open_tag);
            $close_tag_count = substr_count($section_to_remove, $close_tag);
        } while ($open_tag_count > $close_tag_count && $max_recursion > 0);
        // The section always begins with $tag_start, so each pass removes
        // at least one occurrence and the outer loop makes progress.
        $string = str_replace($section_to_remove, '', $string);
    }
    return $string;
}
/**
 * Parses a feed entry and replaces its content with the full article:
 * the post image (when present), the feed summary in bold, then the
 * 'div.entry-content' markup stripped of the entry prefix, scripts and asides.
 *
 * @param mixed $item Feed entry, forwarded untouched to parent::parseItem().
 * @return mixed The parsed item with its 'content' key updated.
 */
protected function parseItem($item){
    $item = parent::parseItem($item);
    $article_html = $this->getSimpleHTMLDOMCached($item['uri']);
    if(!$article_html){
        $item['content'] = 'Could not request '.$this->getName().': '.$item['uri'];
        return $item;
    }

    // Without the article body there is nothing to expand: keep the feed summary.
    $content_node = $article_html->find('div.entry-content', 0);
    if(!$content_node){
        return $item;
    }

    $article_content = $content_node->innertext;
    $article_content = $this->StripRecursiveHTMLSection($article_content , 'div', '<div class="entry-prefix"');
    $article_content = $this->StripRecursiveHTMLSection($article_content , 'script', '<script');
    $article_content = $this->StripRecursiveHTMLSection($article_content , 'aside', '<aside');

    // The post image is optional; only emit the <img> block when it exists.
    $image_node = $article_html->find('img.wp-post-image', 0);
    $image_html = $image_node ? '<p><img src="'.$image_node->src.'" /></p>' : '';

    $item['content'] = $image_html.'<p><b>'.$item['content'].'</b></p>'.$article_content;
    return $item;
}
/**
 * Hands the Naked Security feedburner URL to collectExpandableDatas().
 */
public function collectData(){
    $this->collectExpandableDatas('https://feeds.feedburner.com/nakedsecurity?format=xml');
}
const PARAMETERS = array();
}

View file

@ -1,29 +1,15 @@
<?php
class NumeramaBridge extends FeedExpander {
require_once('WordPressBridge.php');
class NumeramaBridge extends WordPressBridge {
const MAINTAINER = 'mitsukarenai';
const NAME = 'Numerama';
const URI = 'http://www.numerama.com/';
const DESCRIPTION = 'Returns the 5 newest posts from Numerama (full text)';
/**
 * Passes the site's 'feed/' URL and the value 5 to collectExpandableDatas().
 */
public function collectData(){
    $feedUrl = self::URI . 'feed/';
    // presumably the maximum number of items to expand — verify against collectExpandableDatas()
    $limit = 5;
    $this->collectExpandableDatas($feedUrl, $limit);
}
/**
 * Parses a feed entry, then replaces its content with what ExtractContent()
 * builds from the article page.
 *
 * @param mixed $newsItem Feed entry, forwarded untouched to parent::parseItem().
 * @return mixed The parsed item with its 'content' key replaced.
 */
protected function parseItem($newsItem){
    $parsed = parent::parseItem($newsItem);
    $fullText = $this->ExtractContent($parsed['uri']);
    $parsed['content'] = $fullText;
    return $parsed;
}
/**
 * Builds the item content from an article page: the picture referenced by
 * the og:image meta tag followed by the 'post-content' article markup, with
 * the 'related-article' section emptied beforehand.
 *
 * @param string $url Address of the article page.
 * @return string HTML for the item, or an error notice when the page
 *                cannot be loaded.
 */
private function ExtractContent($url){
    // Request the article itself; the error text must not be part of the URL.
    $article_html = $this->getSimpleHTMLDOMCached($url);
    // NOTE(review): assumes a failed request yields a falsy value — confirm.
    if(!$article_html){
        return 'Could not request Numerama: '.$url;
    }

    $related = $article_html->find('section[class=related-article]', 0);
    if($related){
        $related->innertext = ''; // remove related articles block
    }

    $contents = '';
    $picture = $article_html->find('meta[property=og:image]', 0);
    if($picture){
        $contents = '<img alt="" style="max-width:300px;" src="'.$picture->getAttribute('content').'">'; // add post picture
    }

    $post = $article_html->find('article[class=post-content]', 0);
    if($post){
        $contents .= $post->innertext; // extract the post
    }
    return $contents;
}
const DESCRIPTION = 'Returns the newest posts from Numerama (full text)';
const PARAMETERS = array();
/**
 * Cache duration for this bridge: thirty minutes, expressed in seconds.
 *
 * @return int 1800
 */
public function getCacheDuration() {
    $thirtyMinutes = 30 * 60;
    return $thirtyMinutes;
}
}

View file

@ -1,39 +1,13 @@
<?php
class SiliconBridge extends FeedExpander {
require_once('WordPressBridge.php');
class SiliconBridge extends WordPressBridge {
const MAINTAINER = "ORelio";
const NAME = 'Silicon Bridge';
const URI = 'http://www.silicon.fr/';
const DESCRIPTION = "Returns the newest articles.";
/**
 * Parses a feed entry and replaces its content with the article excerpt
 * (bold) followed by the 'div.entry-content' markup, with every <script>
 * block removed.
 *
 * @param mixed $item Feed entry, forwarded untouched to parent::parseItem().
 * @return mixed The parsed item with its 'content' key updated.
 */
protected function parseItem($item){
    $item = parent::parseItem($item);
    $article_html = $this->getSimpleHTMLDOMCached($item['uri']);
    if(!$article_html){
        $item['content'] .= '<p>Could not request Silicon: '.$item['uri'].'</p>';
        return $item;
    }

    // Without the article body there is nothing to expand: keep the feed content.
    $body = $article_html->find('div.entry-content', 0);
    if(!$body){
        return $item;
    }

    // The excerpt is optional.
    $excerpt = $article_html->find('div.entry-excerpt', 0);
    $article_content = ($excerpt ? '<p><b>'.$excerpt->plaintext.'</b></p>' : '')
        .$body->innertext;

    //Remove useless scripts left in the page
    while (($script_start = strpos($article_content, '<script')) !== false) {
        $script_end = strpos($article_content, '</script>', $script_start);
        if ($script_end === false) {
            // Unterminated script: drop everything from its opening tag on
            // instead of cutting an arbitrary 9-character fragment.
            $article_content = substr($article_content, 0, $script_start);
            break;
        }
        // 9 is the length of '</script>'
        $article_content = substr($article_content, 0, $script_start)
            .substr($article_content, $script_end + 9);
    }

    $item['content'] = $article_content;
    return $item;
}
/**
 * Hands the site's 'feed' URL to collectExpandableDatas().
 */
public function collectData(){
    $this->collectExpandableDatas(self::URI.'feed');
}
const PARAMETERS = array();
public function getCacheDuration() {
return 1800; // 30 minutes

View file

@ -1,41 +1,13 @@
<?php
class ZatazBridge extends BridgeAbstract {
require_once('WordPressBridge.php');
class ZatazBridge extends WordPressBridge{
const MAINTAINER = "aledeg";
const NAME = 'Zataz Magazine';
const URI = 'http://www.zataz.com';
const DESCRIPTION = "ZATAZ Magazine - S'informer, c'est déjà se sécuriser";
/**
 * Collects up to five items from the links listed in the '#recent-posts-3'
 * block of the home page; each link is expanded through getDetails().
 */
public function collectData(){
    $html = $this->getSimpleHTMLDOM(self::URI) or $this->returnServerError('Could not request ' . self::URI);

    // Resolve the list step by step: any missing level would otherwise
    // trigger a call on a non-object.
    $widget = $html->find('#recent-posts-3', 0);
    $list = $widget ? $widget->find('ul', 0) : null;
    if (!$list) {
        $this->returnServerError('Could not find recent posts on ' . self::URI);
        return; // in case returnServerError() does not interrupt execution
    }

    foreach ($list->find('li') as $article) {
        if (count($this->items) >= 5) {
            break; // enough items, no need to walk the remaining entries
        }
        $link = $article->find('a', 0);
        if (!$link) {
            continue;
        }
        $this->items[] = $this->getDetails($link->href);
    }
}
/**
 * Loads an article page and builds the item describing it.
 *
 * @param string $uri Address of the article.
 * @return array Item with the keys 'uri', 'title', 'content' and 'timestamp'.
 */
private function getDetails($uri) {
    // Report the failure the same way collectData() does instead of
    // silently terminating the whole script.
    $html = $this->getSimpleHTMLDOM($uri) or $this->returnServerError('Could not request ' . $uri);

    $item = array();
    $article = $html->find('.gdl-blog-full', 0);
    if (!$article) {
        $this->returnServerError('Could not find the article in ' . $uri);
    }

    $item['uri'] = $uri;
    $item['title'] = $article->find('.blog-title', 0)->find('a', 0)->innertext;
    $item['content'] = $article->find('.blog-content', 0)->innertext;
    // The publication date is read from the href of the link inside '.blog-date'.
    $item['timestamp'] = $this->getTimestampFromDate($article->find('.blog-date', 0)->find('a', 0)->href);
    return $item;
}
/**
 * Extracts a 'YYYY/MM/DD' date from a URI and converts it to a Unix timestamp.
 *
 * @param string $uri Text expected to contain a date such as '2016/09/15'.
 * @return string Timestamp as produced by DateTime::format('U'); the current
 *                time is used when no date is present in $uri.
 */
private function getTimestampFromDate($uri) {
    // Only read $matches[0] when the pattern actually matched; otherwise
    // fall back explicitly to "now" instead of relying on an undefined index.
    $found = preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches);
    $date = new \DateTime($found === 1 ? $matches[0] : 'now');
    return $date->format('U');
}
const PARAMETERS = array();
public function getCacheDuration() {
return 7200; // 2h