[bridges] Change all bridges to use BridgeAbstract with getSimpleHTMLDOMCached
This commit is contained in:
parent
9f2dd48684
commit
2eec89ab27
21 changed files with 27 additions and 27 deletions
|
@ -16,7 +16,7 @@ class CADBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function CADExtractContent($url) {
|
private function CADExtractContent($url) {
|
||||||
$html3 = $this->get_cached($url);
|
$html3 = $this->getSimpleHTMLDOMCached($url);
|
||||||
|
|
||||||
// The request might fail due to missing https support or wrong URL
|
// The request might fail due to missing https support or wrong URL
|
||||||
if($html3 == false)
|
if($html3 == false)
|
||||||
|
|
|
@ -17,7 +17,7 @@ class CommonDreamsBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function CommonDreamsExtractContent($url) {
|
private function CommonDreamsExtractContent($url) {
|
||||||
$html3 = $this->get_cached($url);
|
$html3 = $this->getSimpleHTMLDOMCached($url);
|
||||||
$text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext;
|
$text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext;
|
||||||
$html3->clear();
|
$html3->clear();
|
||||||
unset ($html3);
|
unset ($html3);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<?php
|
<?php
|
||||||
class CpasbienBridge extends HttpCachingBridgeAbstract{
|
class CpasbienBridge extends BridgeAbstract {
|
||||||
|
|
||||||
const MAINTAINER = "lagaisse";
|
const MAINTAINER = "lagaisse";
|
||||||
const NAME = "Cpasbien Bridge";
|
const NAME = "Cpasbien Bridge";
|
||||||
|
@ -23,7 +23,7 @@ class CpasbienBridge extends HttpCachingBridgeAbstract{
|
||||||
if ($episode->getAttribute('class')=='ligne0' ||
|
if ($episode->getAttribute('class')=='ligne0' ||
|
||||||
$episode->getAttribute('class')=='ligne1')
|
$episode->getAttribute('class')=='ligne1')
|
||||||
{
|
{
|
||||||
$htmlepisode=$this->get_cached($episode->find('a', 0)->getAttribute('href'));
|
$htmlepisode=$this->getSimpleHTMLDOMCached($episode->find('a', 0)->getAttribute('href'));
|
||||||
|
|
||||||
$item = array();
|
$item = array();
|
||||||
$item['author'] = $episode->find('a', 0)->text();
|
$item['author'] = $episode->find('a', 0)->text();
|
||||||
|
|
|
@ -47,7 +47,7 @@ class DauphineLibereBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function ExtractContent($url) {
|
private function ExtractContent($url) {
|
||||||
$html2 = $this->get_cached($url);
|
$html2 = $this->getSimpleHTMLDOMCached($url);
|
||||||
$text = $html2->find('div.column', 0)->innertext;
|
$text = $html2->find('div.column', 0)->innertext;
|
||||||
$text = preg_replace('@<script[^>]*?>.*?</script>@si', '', $text);
|
$text = preg_replace('@<script[^>]*?>.*?</script>@si', '', $text);
|
||||||
return $text;
|
return $text;
|
||||||
|
|
|
@ -42,7 +42,7 @@ class DeveloppezDotComBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function DeveloppezDotComExtractContent($url) {
|
private function DeveloppezDotComExtractContent($url) {
|
||||||
$articleHTMLContent = $this->get_cached($url);
|
$articleHTMLContent = $this->getSimpleHTMLDOMCached($url);
|
||||||
$text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext);
|
$text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext);
|
||||||
$text = utf8_encode($text);
|
$text = utf8_encode($text);
|
||||||
return trim($text);
|
return trim($text);
|
||||||
|
|
|
@ -13,7 +13,7 @@ class FreenewsBridge extends FeedExpander {
|
||||||
protected function parseItem($newsItem) {
|
protected function parseItem($newsItem) {
|
||||||
$item = $this->parseRSS_2_0_Item($newsItem);
|
$item = $this->parseRSS_2_0_Item($newsItem);
|
||||||
|
|
||||||
$articlePage = $this->get_cached($item['uri']);
|
$articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||||
$content = $articlePage->find('.post-container', 0);
|
$content = $articlePage->find('.post-container', 0);
|
||||||
$item['content'] = $content->innertext;
|
$item['content'] = $content->innertext;
|
||||||
|
|
||||||
|
|
|
@ -86,7 +86,7 @@ class FuturaSciencesBridge extends FeedExpander {
|
||||||
protected function parseItem($newsItem){
|
protected function parseItem($newsItem){
|
||||||
$item = $this->parseRSS_2_0_Item($newsItem);
|
$item = $this->parseRSS_2_0_Item($newsItem);
|
||||||
$item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']);
|
$item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']);
|
||||||
$article = $this->get_cached($item['uri'])
|
$article = $this->getSimpleHTMLDOMCached($item['uri'])
|
||||||
or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']);
|
or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']);
|
||||||
$item['content'] = $this->ExtractArticleContent($article);
|
$item['content'] = $this->ExtractArticleContent($article);
|
||||||
$item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article);
|
$item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<?php
|
<?php
|
||||||
class JapanExpoBridge extends HttpCachingBridgeAbstract {
|
class JapanExpoBridge extends BridgeAbstract {
|
||||||
|
|
||||||
const MAINTAINER = 'Ginko';
|
const MAINTAINER = 'Ginko';
|
||||||
const NAME = 'Japan Expo Actualités';
|
const NAME = 'Japan Expo Actualités';
|
||||||
|
@ -60,7 +60,7 @@ class JapanExpoBridge extends HttpCachingBridgeAbstract {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
$article_html = $this->get_cached($url) or $this->returnServerError('Could not request JapanExpo: '.$url);
|
// Fetch the article by its URL; the message belongs only to the failure path.
// `=` binds tighter than `or`, so the fallback runs when the fetch result is falsy.
$article_html = $this->getSimpleHTMLDOMCached($url) or $this->returnServerError('Could not request JapanExpo: '.$url);
|
||||||
$header = $article_html->find('header.pageHeadBox', 0);
|
$header = $article_html->find('header.pageHeadBox', 0);
|
||||||
$timestamp = strtotime($header->find('time', 0)->datetime);
|
$timestamp = strtotime($header->find('time', 0)->datetime);
|
||||||
$title_html = $header->find('div.section', 0)->next_sibling();
|
$title_html = $header->find('div.section', 0)->next_sibling();
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<?php
|
<?php
|
||||||
class KununuBridge extends HttpCachingBridgeAbstract {
|
class KununuBridge extends BridgeAbstract {
|
||||||
const MAINTAINER = "logmanoriginal";
|
const MAINTAINER = "logmanoriginal";
|
||||||
const NAME = "Kununu Bridge";
|
const NAME = "Kununu Bridge";
|
||||||
const URI = "https://www.kununu.com/";
|
const URI = "https://www.kununu.com/";
|
||||||
|
@ -224,7 +224,7 @@ class KununuBridge extends HttpCachingBridgeAbstract {
|
||||||
*/
|
*/
|
||||||
private function extract_full_description($uri){
|
private function extract_full_description($uri){
|
||||||
// Load full article
|
// Load full article
|
||||||
$html = $this->get_cached($uri);
|
$html = $this->getSimpleHTMLDOMCached($uri);
|
||||||
if($html === false)
|
if($html === false)
|
||||||
$this->returnServerError('Could not load full description!');
|
$this->returnServerError('Could not load full description!');
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ class LeJournalDuGeekBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function LeJournalDuGeekExtractContent($url) {
|
private function LeJournalDuGeekExtractContent($url) {
|
||||||
$articleHTMLContent = $this->get_cached($url);
|
$articleHTMLContent = $this->getSimpleHTMLDOMCached($url);
|
||||||
$text = $articleHTMLContent->find('div.post-content', 0)->innertext;
|
$text = $articleHTMLContent->find('div.post-content', 0)->innertext;
|
||||||
|
|
||||||
foreach($articleHTMLContent->find('a.more') as $element) {
|
foreach($articleHTMLContent->find('a.more') as $element) {
|
||||||
|
|
|
@ -12,7 +12,7 @@ class LeMondeInformatiqueBridge extends FeedExpander {
|
||||||
|
|
||||||
protected function parseItem($newsItem){
|
protected function parseItem($newsItem){
|
||||||
$item = $this->parseRSS_1_0_Item($newsItem);
|
$item = $this->parseRSS_1_0_Item($newsItem);
|
||||||
$article_html = $this->get_cached($item['uri'])
|
$article_html = $this->getSimpleHTMLDOMCached($item['uri'])
|
||||||
or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']);
|
or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']);
|
||||||
$item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext);
|
$item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext);
|
||||||
$item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext;
|
$item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext;
|
||||||
|
|
|
@ -17,7 +17,7 @@ class LichessBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function retrieve_lichess_post($blog_post_uri){
|
private function retrieve_lichess_post($blog_post_uri){
|
||||||
$blog_post_html = $this->get_cached($blog_post_uri);
|
$blog_post_html = $this->getSimpleHTMLDOMCached($blog_post_uri);
|
||||||
$blog_post_div = $blog_post_html->find('#lichess_blog', 0);
|
$blog_post_div = $blog_post_html->find('#lichess_blog', 0);
|
||||||
|
|
||||||
$post_chapo = $blog_post_div->find('.shortlede', 0)->innertext;
|
$post_chapo = $blog_post_div->find('.shortlede', 0)->innertext;
|
||||||
|
|
|
@ -17,7 +17,7 @@ class NextInpactBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function ExtractContent($url) {
|
private function ExtractContent($url) {
|
||||||
$html2 = $this->get_cached($url);
|
$html2 = $this->getSimpleHTMLDOMCached($url);
|
||||||
$text = '<p><em>'.$html2->find('span.sub_title', 0)->innertext.'</em></p>'
|
$text = '<p><em>'.$html2->find('span.sub_title', 0)->innertext.'</em></p>'
|
||||||
.'<p><img src="'.$html2->find('div.container_main_image_article', 0)->find('img.dedicated',0)->src.'" alt="-" /></p>'
|
.'<p><img src="'.$html2->find('div.container_main_image_article', 0)->find('img.dedicated',0)->src.'" alt="-" /></p>'
|
||||||
.'<div>'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'</div>';
|
.'<div>'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'</div>';
|
||||||
|
|
|
@ -56,7 +56,7 @@ class NextgovBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function ExtractContent($url){
|
private function ExtractContent($url){
|
||||||
$article = $this->get_cached($url)
|
$article = $this->getSimpleHTMLDOMCached($url)
|
||||||
or $this->returnServerError('Could not request Nextgov: ' . $url);
|
or $this->returnServerError('Could not request Nextgov: ' . $url);
|
||||||
|
|
||||||
$contents = $article->find('div.wysiwyg', 0)->innertext;
|
$contents = $article->find('div.wysiwyg', 0)->innertext;
|
||||||
|
|
|
@ -17,7 +17,7 @@ class NiceMatinBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function NiceMatinExtractContent($url) {
|
private function NiceMatinExtractContent($url) {
|
||||||
$html = $this->get_cached($url);
|
$html = $this->getSimpleHTMLDOMCached($url);
|
||||||
if(!$html)
|
if(!$html)
|
||||||
return 'Could not acquire content from url: ' . $url . '!';
|
return 'Could not acquire content from url: ' . $url . '!';
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ class NumeramaBridge extends FeedExpander {
|
||||||
}
|
}
|
||||||
|
|
||||||
private function ExtractContent($url){
|
private function ExtractContent($url){
|
||||||
$article_html = $this->get_cached($url) or $this->returnServerError('Could not request Numerama: '.$url);
|
// Fetch the article by its URL; the message belongs only to the failure path.
// `=` binds tighter than `or`, so the fallback runs when the fetch result is falsy.
$article_html = $this->getSimpleHTMLDOMCached($url) or $this->returnServerError('Could not request Numerama: '.$url);
|
||||||
$contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block
|
$contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block
|
||||||
$contents = '<img alt="" style="max-width:300px;" src="'.$article_html->find('meta[property=og:image]', 0)->getAttribute('content').'">'; // add post picture
|
$contents = '<img alt="" style="max-width:300px;" src="'.$article_html->find('meta[property=og:image]', 0)->getAttribute('content').'">'; // add post picture
|
||||||
return $contents . $article_html->find('article[class=post-content]', 0)->innertext; // extract the post
|
return $contents . $article_html->find('article[class=post-content]', 0)->innertext; // extract the post
|
||||||
|
|
|
@ -13,7 +13,7 @@ class TheOatmealBridge extends FeedExpander{
|
||||||
protected function parseItem($newsItem) {
|
protected function parseItem($newsItem) {
|
||||||
$item = $this->parseRSS_1_0_Item($newsItem);
|
$item = $this->parseRSS_1_0_Item($newsItem);
|
||||||
|
|
||||||
$articlePage = $this->get_cached($item['uri']);
|
$articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||||
$content = $articlePage->find('#comic', 0);
|
$content = $articlePage->find('#comic', 0);
|
||||||
if(is_null($content)) // load alternative
|
if(is_null($content)) // load alternative
|
||||||
$content = $articlePage->find('#blog', 0);
|
$content = $articlePage->find('#blog', 0);
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article
|
define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article
|
||||||
define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know...
|
define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know...
|
||||||
|
|
||||||
class WikipediaBridge extends HttpCachingBridgeAbstract {
|
class WikipediaBridge extends BridgeAbstract {
|
||||||
const MAINTAINER = 'logmanoriginal';
|
const MAINTAINER = 'logmanoriginal';
|
||||||
const NAME = 'Wikipedia bridge for many languages';
|
const NAME = 'Wikipedia bridge for many languages';
|
||||||
const URI = 'https://www.wikipedia.org/';
|
const URI = 'https://www.wikipedia.org/';
|
||||||
|
@ -175,7 +175,7 @@ class WikipediaBridge extends HttpCachingBridgeAbstract {
|
||||||
* Loads the full article from a given URI
|
* Loads the full article from a given URI
|
||||||
*/
|
*/
|
||||||
private function LoadFullArticle($uri){
|
private function LoadFullArticle($uri){
|
||||||
$content_html = $this->get_cached($uri);
|
$content_html = $this->getSimpleHTMLDOMCached($uri);
|
||||||
|
|
||||||
if(!$content_html)
|
if(!$content_html)
|
||||||
$this->returnServerError('Could not load site: ' . $uri . '!');
|
$this->returnServerError('Could not load site: ' . $uri . '!');
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
<?php
|
<?php
|
||||||
define('WORDPRESS_TYPE_ATOM', 1); // Content is of type ATOM
|
define('WORDPRESS_TYPE_ATOM', 1); // Content is of type ATOM
|
||||||
define('WORDPRESS_TYPE_RSS', 2); // Content is of type RSS
|
define('WORDPRESS_TYPE_RSS', 2); // Content is of type RSS
|
||||||
class WordPressBridge extends HttpCachingBridgeAbstract {
|
class WordPressBridge extends BridgeAbstract {
|
||||||
|
|
||||||
public $sitename; // Name of the site
|
public $sitename; // Name of the site
|
||||||
|
|
||||||
|
@ -82,7 +82,7 @@ class WordPressBridge extends HttpCachingBridgeAbstract {
|
||||||
$item['timestamp'] = strtotime($article->find('updated', 0)->innertext);
|
$item['timestamp'] = strtotime($article->find('updated', 0)->innertext);
|
||||||
}
|
}
|
||||||
|
|
||||||
$article_html = $this->get_cached($item['uri']);
|
$article_html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||||
|
|
||||||
// Attempt to find most common content div
|
// Attempt to find most common content div
|
||||||
if(!isset($item['content'])){
|
if(!isset($item['content'])){
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<?php
|
<?php
|
||||||
class WorldOfTanksBridge extends HttpCachingBridgeAbstract{
|
class WorldOfTanksBridge extends BridgeAbstract {
|
||||||
|
|
||||||
const MAINTAINER = "mitsukarenai";
|
const MAINTAINER = "mitsukarenai";
|
||||||
const NAME = "World of Tanks";
|
const NAME = "World of Tanks";
|
||||||
|
@ -57,7 +57,7 @@ class WorldOfTanksBridge extends HttpCachingBridgeAbstract{
|
||||||
$item['uri'] = self::URI.$infoLink->href;
|
$item['uri'] = self::URI.$infoLink->href;
|
||||||
// now load that uri from cache
|
// now load that uri from cache
|
||||||
$this->debugMessage("loading page ".$item['uri']);
|
$this->debugMessage("loading page ".$item['uri']);
|
||||||
$articlePage = $this->get_cached($item['uri']);
|
$articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||||
$content = $articlePage->find('.l-content', 0);
|
$content = $articlePage->find('.l-content', 0);
|
||||||
HTMLSanitizer::defaultImageSrcTo($content, self::URI);
|
HTMLSanitizer::defaultImageSrcTo($content, self::URI);
|
||||||
$item['title'] = $content->find('h1', 0)->innertext;
|
$item['title'] = $content->find('h1', 0)->innertext;
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
<?php
|
<?php
|
||||||
require_once(__DIR__ . '/BridgeInterface.php');
|
require_once(__DIR__ . '/BridgeInterface.php');
|
||||||
abstract class FeedExpander extends HttpCachingBridgeAbstract {
|
abstract class FeedExpander extends BridgeAbstract {
|
||||||
|
|
||||||
private $name;
|
private $name;
|
||||||
private $uri;
|
private $uri;
|
||||||
|
|
Loading…
Reference in a new issue