From 300b8bba9b229fd5dfbd496602195a173da46901 Mon Sep 17 00:00:00 2001 From: ORelio Date: Fri, 23 Oct 2015 12:08:18 +0200 Subject: [PATCH 1/2] [Facebook] More cleaning and bypass link redirect Remove a bunch of useless html node properties Bypass all l.php facebook link redirections Restore and basic formatting --- bridges/FacebookBridge.php | 40 ++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/bridges/FacebookBridge.php b/bridges/FacebookBridge.php index 109be9bb..3df13c34 100644 --- a/bridges/FacebookBridge.php +++ b/bridges/FacebookBridge.php @@ -3,7 +3,7 @@ * @name Facebook * @homepage http://facebook.com/ * @description Input a page title or a profile log. For a profile log, please insert the parameter as follow : myExamplePage/132621766841117 -* @update 05/09/2015 +* @update 23/10/2015 * @maintainer teromene * @use1(u="username") */ @@ -13,6 +13,27 @@ class FacebookBridge extends BridgeAbstract{ public function collectData(array $param){ + //Extract a string using start and end delimiters + function ExtractFromDelimiters($string, $start, $end) { + if (strpos($string, $start) !== false) { + $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); + $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); + return $section_retrieved; + } return false; + } + + //Utility function for cleaning a Facebook link + $unescape_fb_link = function ($matches) { + if (is_array($matches) && count($matches) > 1) { + $link = $matches[1]; + if (strpos($link, '/') === 0) + $link = 'https://facebook.com'.$link.'"'; + if (strpos($link, 'facebook.com/l.php?u=') !== false) + $link = urldecode(ExtractFromDelimiters($link, 'facebook.com/l.php?u=', '&')); + return ' href="'.$link.'"'; + } + }; + $html = ''; if(isset($param['u'])) { @@ -39,16 +60,23 @@ class FacebookBridge extends BridgeAbstract{ if($post->hasAttribute("data-time")) { - //Clean the content of the page and convert relative links into absolute links + //Retrieve post contents $content = preg_replace('/(?i)>
]+)>(.+?)div\ class=\"userContent\"/i', '', $post); $content = preg_replace('/(?i)>
]+)>(.+?)<\/div><\/div>
]+)>(.+?)div\ class=\"[^u]+userContent\"/i', '', $content); $content = preg_replace('/(?i)>
]+)>(.+?)<\/div>/i', '', $content); - $content = str_replace(' href="/', ' href="https://facebook.com/', $content); - $content = preg_replace('/ onmouseover=\"[^"]+\"/i', '', $content); - $content = preg_replace('/ onclick=\"[^"]+\"/i', '', $content); + + //Remove html nodes, keep only img, links, basic formatting + $content = strip_tags($content,''); + + //Adapt link hrefs: convert relative links into absolute links and bypass external link redirection + $content = preg_replace_callback('/ href=\"([^"]+)\"/i', $unescape_fb_link, $content); + + //Clean useless html tag properties and fix link closing tags + foreach (array('onmouseover', 'onclick', 'target', 'ajaxify', 'tabindex', + 'class', 'style', 'data-[^=]*', 'aria-[^=]*', 'role', 'rel', 'id') as $property_name) + $content = preg_replace('/ '.$property_name.'=\"[^"]*\"/i', '', $content); $content = preg_replace('/<\/a [^>]+>/i', '', $content); - $content = strip_tags($content,''); //Retrieve date of the post $date = $post->find("abbr")[0]; From c8ef31bac66b4ad710405886c4afdda7ae7eb0fb Mon Sep 17 00:00:00 2001 From: ORelio Date: Fri, 23 Oct 2015 12:19:12 +0200 Subject: [PATCH 2/2] [NextInpact] Fix php notice message For non-premium articles, a notice is generated because of trying to get property from a non-object. Fixed that with a proper check using is_object --- bridges/NextInpactBridge.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bridges/NextInpactBridge.php b/bridges/NextInpactBridge.php index c91a4837..c070038a 100644 --- a/bridges/NextInpactBridge.php +++ b/bridges/NextInpactBridge.php @@ -8,7 +8,7 @@ * @homepage http://www.nextinpact.com/ * @description Returns the newest articles. * @maintainer qwertygc -* @update 2015-09-05 +* @update 2015-10-23 */ class NextInpactBridge extends BridgeAbstract { @@ -25,9 +25,9 @@ class NextInpactBridge extends BridgeAbstract { $text = '

'.$html2->find('span.sub_title', 0)->innertext.'

' .'

-

' .'
'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'
'; - $premium_article = $html2->find('h2.title_reserve_article', 0)->innertext; - if (strlen($premium_article) > 0) - $text = $text.'

'.$premium_article.'

'; + $premium_article = $html2->find('h2.title_reserve_article', 0); + if (is_object($premium_article)) + $text = $text.'

'.$premium_article->innertext.'

'; return $text; }