From 1a4a4284493eaaa2bc707640369edcabc734571a Mon Sep 17 00:00:00 2001 From: Teromene Date: Fri, 27 Nov 2015 14:20:33 +0000 Subject: [PATCH] =?UTF-8?q?Correction=20de=20quelques=20probl=C3=A8mes=20a?= =?UTF-8?q?vec=20RSSExpander=20suite=20=C3=A0=20la=20migration=20dans=20li?= =?UTF-8?q?b/Bridge.=20Correction=20de=20typos.=20Ajout=20de=20la=20possib?= =?UTF-8?q?ilit=C3=A9=20d'utiliser=20un=20proxy.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bridges/Gawker.php | 6 ++--- bridges/WorldOfTanks.php | 2 +- index.php | 3 +++ lib/Bridge.php | 33 ++++++++++++++---------- lib/HTMLUtils.php | 11 +++++--- vendor/simplehtmldom/simple_html_dom.php | 4 +-- 6 files changed, 37 insertions(+), 22 deletions(-) diff --git a/bridges/Gawker.php b/bridges/Gawker.php index efb75d82..624e66db 100644 --- a/bridges/Gawker.php +++ b/bridges/Gawker.php @@ -31,10 +31,10 @@ class Gawker extends RssExpander{ trigger_error("If no site is provided, nothing is gonna happen", E_USER_ERROR); } else { $this->name = $param['site']; - $param['url'] = $this->toURI(strtolower($param['site'])); + $url = $this->toURI(strtolower($param['site'])); } // $this->message("loading feed from ".$this->getURI()); - parent::collectExpandableDatas($param, $name); + parent::collectExpandableDatas($param, $url); } protected function parseRSSItem($newsItem) { @@ -49,7 +49,7 @@ class Gawker extends RssExpander{ $articlePage = str_get_html($this->get_cached($item->uri)); if(is_object($articlePage)) { $content = $articlePage->find('.post-content', 0); - $this->defaultImageSrcTo($content, $this->getURI()); + HTMLSanitizer::defaultImageSrcTo($content, $this->getURI()); $vcard = $articlePage->find('.vcard', 0); if(is_object($vcard)) { $authorLink = $vcard->find('a', 0); diff --git a/bridges/WorldOfTanks.php b/bridges/WorldOfTanks.php index c6dc7efc..adaf22c3 100644 --- a/bridges/WorldOfTanks.php +++ b/bridges/WorldOfTanks.php @@ -86,7 +86,7 @@ class WorldOfTanks extends HttpCachingBridgeAbstract{ // $this->message("loading page ".$item->uri); $articlePage = str_get_html($this->get_cached($item->uri)); $content = $articlePage->find('.l-content', 0); - $this->defaultImageSrcTo($content, WORLD_OF_TANKS); + HTMLSanitizer::defaultImageSrcTo($content, WORLD_OF_TANKS); $item->title = $content->find('h1', 0)->innertext; $item->content = $content->find('.b-content', 0)->innertext; // $item->name = $auteur->innertext; diff --git a/index.php b/index.php index 31e1e0b5..a2546d68 100644 --- a/index.php +++ b/index.php @@ -11,6 +11,8 @@ TODO : - implement header('X-Cached-Version: '.date(DATE_ATOM, filemtime($cachefile))); */ +//define('PROXY_URL', 'tcp://192.168.0.0:28'); + date_default_timezone_set('UTC'); error_reporting(0); //ini_set('display_errors','1'); error_reporting(E_ALL); // For debugging only. @@ -21,6 +23,7 @@ if (!extension_loaded('openssl')) // FIXME : beta test UA spoofing, please report any blacklisting by PHP-fopen-unfriendly websites ini_set('user_agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20121202 Firefox/30.0 (rss-bridge/0.1; +https://github.com/sebsauvage/rss-bridge)'); + // ------- // cache file purge - delete cache files older than 24 hours diff --git a/lib/Bridge.php b/lib/Bridge.php index 6fb67b7f..3a104a1b 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -39,7 +39,7 @@ abstract class BridgeAbstract implements BridgeInterface{ } /** - * Return datas store in the bridge + * Return datas stored in the bridge * @return mixed */ public function getDatas(){ @@ -50,7 +50,7 @@ abstract class BridgeAbstract implements BridgeInterface{ /** * Defined datas with parameters depending choose bridge - * Note : you can defined a cache before with "setCache" + * Note : you can define a cache before with "setCache" * @param array $param $_REQUEST, $_GET, $_POST, or array with bridge expected paramters */ public function setDatas(array $param){ @@ -90,16 +90,6 @@ abstract class BridgeAbstract implements BridgeInterface{ return $this; } - /** - * Set default image SRC attribute to point on given server when none is provided (that's to say when image src starts with '/' - */ - public function defaultImageSrcTo($content, $server) { - foreach($content->find('img') as $image) { - if(strpos($image->src, '/')==0) { - $image->src = $server.$image->src; - } - } - } } /** @@ -311,7 +301,7 @@ abstract class RssExpander extends HttpCachingBridgeAbstract{ public function collectExpandableDatas(array $param, $name){ if (empty($name)) { - $this->returnError('There is no $param[\'url\'] for this RSS expander', 404); + $this->returnError('There is no $name for this RSS expander', 404); } // $this->message("Loading from ".$param['url']); // Notice WE DO NOT use cache here on purpose : we want a fresh view of the RSS stream each time @@ -363,3 +353,20 @@ abstract class RssExpander extends HttpCachingBridgeAbstract{ return $this->description; } } + +function advanced_file_get_contents($url) { + + if(defined('PROXY_URL')) { + $context = array( + 'http' => array( + 'proxy' => PROXY_URL, + 'request_fulluri' => true, + ), + ); + $context = stream_context_create($context); + return file_get_contents($url, false, $context); + } else { + return file_get_contents($url); + } + +} diff --git a/lib/HTMLUtils.php b/lib/HTMLUtils.php index ff25130d..9f097230 100644 --- a/lib/HTMLUtils.php +++ b/lib/HTMLUtils.php @@ -91,7 +91,7 @@ CARD; } else if($inputEntry['type'] == 'number') { $card .= '
' . PHP_EOL; } else if($inputEntry['type'] == 'list') { - $card .= ''; foreach($inputEntry['values'] as $listValues) { $card .= ""; @@ -157,9 +157,7 @@ class HTMLSanitizer { $element->outertext = ''; } else { foreach($element->getAllAttributes() as $attributeName => $attribute) { - if(!in_array($attributeName, $this->keptAttributes)) $element->removeAttribute($attributeName); - } } } @@ -167,6 +165,13 @@ class HTMLSanitizer { return $htmlContent; } + public static function defaultImageSrcTo($content, $server) { + foreach($content->find('img') as $image) { + if(strpos($image->src, '/')==0) { + $image->src = $server.$image->src; + } + } + } } ?> diff --git a/vendor/simplehtmldom/simple_html_dom.php b/vendor/simplehtmldom/simple_html_dom.php index b5d30898..22aaa340 100644 --- a/vendor/simplehtmldom/simple_html_dom.php +++ b/vendor/simplehtmldom/simple_html_dom.php @@ -73,7 +73,7 @@ function file_get_html($url, $use_include_path = false, $context=null, $offset = // We DO force the tags to be terminated. $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done. - $contents = file_get_contents($url, $use_include_path, $context, $offset); + $contents = advanced_file_get_contents($url, $use_include_path, $context, $offset); // Paperg - use our own mechanism for getting the contents as we want to control the timeout. //$contents = retrieve_url_contents($url); if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) @@ -1094,7 +1094,7 @@ class simple_html_dom function load_file() { $args = func_get_args(); - $this->load(call_user_func_array('file_get_contents', $args), true); + $this->load(call_user_func_array('advanced_file_get_contents', $args), true); // Throw an error if we can't properly load the dom. if (($error=error_get_last())!==null) { $this->clear();