Merge pull request #162 from ORelio/master

FaceBook improvements, NextInpact notice fix
This commit is contained in:
Mitsu 2015-10-23 14:49:21 +02:00
commit a1b4414da3
2 changed files with 38 additions and 10 deletions

View file

@ -3,7 +3,7 @@
* @name Facebook * @name Facebook
* @homepage http://facebook.com/ * @homepage http://facebook.com/
* @description Input a page title or a profile log. For a profile log, please insert the parameter as follow : myExamplePage/132621766841117 * @description Input a page title or a profile log. For a profile log, please insert the parameter as follow : myExamplePage/132621766841117
* @update 05/09/2015 * @update 23/10/2015
* @maintainer teromene * @maintainer teromene
* @use1(u="username") * @use1(u="username")
*/ */
@ -13,6 +13,27 @@ class FacebookBridge extends BridgeAbstract{
public function collectData(array $param){ public function collectData(array $param){
//Extract the text located between a start delimiter and an end delimiter.
//  $string: text to search in
//  $start:  delimiter marking the beginning of the wanted section (not included in the result)
//  $end:    delimiter marking the end of the wanted section (not included in the result)
//Returns the text between the first $start and the first $end following it,
//or false when either delimiter cannot be found.
//The function_exists() guard is required because a named function declared inside
//a method is registered globally when that method runs: without the guard, running
//the enclosing method a second time triggers a fatal "Cannot redeclare" error.
if (!function_exists('ExtractFromDelimiters')) {
    function ExtractFromDelimiters($string, $start, $end) {
        $start_position = strpos($string, $start);
        if ($start_position === false)
            return false;
        $section_retrieved = substr($string, $start_position + strlen($start));
        //Look for the end delimiter explicitly: feeding a failed strpos() result to
        //substr() as a length would silently yield an empty string instead of a failure
        $end_position = strpos($section_retrieved, $end);
        if ($end_position === false)
            return false;
        return substr($section_retrieved, 0, $end_position);
    }
}
//Utility callback for preg_replace_callback(): rebuilds a clean href attribute from a Facebook link.
//  $matches: match array, where $matches[1] holds the raw href value captured by the pattern
//Returns the rewritten ' href="..."' attribute (with its leading space),
//or an empty string when no href value was captured.
$unescape_fb_link = function ($matches) {
    if (!is_array($matches) || count($matches) < 2)
        return '';
    $link = $matches[1];
    //Relative link: prefix it with the Facebook domain.
    //No quote is appended here, the closing quote is added once in the return statement
    if (strpos($link, '/') === 0)
        $link = 'https://facebook.com'.$link;
    //External link wrapped in the l.php redirector: extract and decode the real target.
    //The trailing '&' guarantees an end delimiter exists even when 'u' is the last parameter
    if (strpos($link, 'facebook.com/l.php?u=') !== false)
        $link = urldecode(ExtractFromDelimiters($link.'&', 'facebook.com/l.php?u=', '&'));
    return ' href="'.$link.'"';
};
$html = ''; $html = '';
if(isset($param['u'])) { if(isset($param['u'])) {
@ -39,16 +60,23 @@ class FacebookBridge extends BridgeAbstract{
if($post->hasAttribute("data-time")) { if($post->hasAttribute("data-time")) {
//Clean the content of the page and convert relative links into absolute links //Retrieve post contents
$content = preg_replace('/(?i)><div class=\"clearfix([^>]+)>(.+?)div\ class=\"userContent\"/i', '', $post); $content = preg_replace('/(?i)><div class=\"clearfix([^>]+)>(.+?)div\ class=\"userContent\"/i', '', $post);
$content = preg_replace('/(?i)><div class=\"_59tj([^>]+)>(.+?)<\/div><\/div><a/i', '', $content); $content = preg_replace('/(?i)><div class=\"_59tj([^>]+)>(.+?)<\/div><\/div><a/i', '', $content);
$content = preg_replace('/(?i)><div class=\"_3dp([^>]+)>(.+?)div\ class=\"[^u]+userContent\"/i', '', $content); $content = preg_replace('/(?i)><div class=\"_3dp([^>]+)>(.+?)div\ class=\"[^u]+userContent\"/i', '', $content);
$content = preg_replace('/(?i)><div class=\"_4l5([^>]+)>(.+?)<\/div>/i', '', $content); $content = preg_replace('/(?i)><div class=\"_4l5([^>]+)>(.+?)<\/div>/i', '', $content);
$content = str_replace(' href="/', ' href="https://facebook.com/', $content);
$content = preg_replace('/ onmouseover=\"[^"]+\"/i', '', $content); //Remove html nodes, keep only img, links, basic formatting
$content = preg_replace('/ onclick=\"[^"]+\"/i', '', $content); $content = strip_tags($content,'<a><img><i><u>');
//Adapt link hrefs: convert relative links into absolute links and bypass external link redirection
$content = preg_replace_callback('/ href=\"([^"]+)\"/i', $unescape_fb_link, $content);
//Clean useless html tag properties and fix link closing tags
foreach (array('onmouseover', 'onclick', 'target', 'ajaxify', 'tabindex',
'class', 'style', 'data-[^=]*', 'aria-[^=]*', 'role', 'rel', 'id') as $property_name)
$content = preg_replace('/ '.$property_name.'=\"[^"]*\"/i', '', $content);
$content = preg_replace('/<\/a [^>]+>/i', '</a>', $content); $content = preg_replace('/<\/a [^>]+>/i', '</a>', $content);
$content = strip_tags($content,'<a><img>');
//Retrieve date of the post //Retrieve date of the post
$date = $post->find("abbr")[0]; $date = $post->find("abbr")[0];

View file

@ -8,7 +8,7 @@
* @homepage http://www.nextinpact.com/ * @homepage http://www.nextinpact.com/
* @description Returns the newest articles. * @description Returns the newest articles.
* @maintainer qwertygc * @maintainer qwertygc
* @update 2015-09-05 * @update 2015-10-23
*/ */
class NextInpactBridge extends BridgeAbstract { class NextInpactBridge extends BridgeAbstract {
@ -25,9 +25,9 @@ class NextInpactBridge extends BridgeAbstract {
$text = '<p><em>'.$html2->find('span.sub_title', 0)->innertext.'</em></p>' $text = '<p><em>'.$html2->find('span.sub_title', 0)->innertext.'</em></p>'
.'<p><img src="'.$html2->find('div.container_main_image_article', 0)->find('img.dedicated',0)->src.'" alt="-" /></p>' .'<p><img src="'.$html2->find('div.container_main_image_article', 0)->find('img.dedicated',0)->src.'" alt="-" /></p>'
.'<div>'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'</div>'; .'<div>'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'</div>';
$premium_article = $html2->find('h2.title_reserve_article', 0)->innertext; $premium_article = $html2->find('h2.title_reserve_article', 0);
if (strlen($premium_article) > 0) if (is_object($premium_article))
$text = $text.'<p><em>'.$premium_article.'</em></p>'; $text = $text.'<p><em>'.$premium_article->innertext.'</em></p>';
return $text; return $text;
} }