From ff4ccf985ff20a31231ccfcd8d8d478e2adb1bde Mon Sep 17 00:00:00 2001
From: Olivier
Date: Mon, 8 Dec 2014 16:53:58 +0100
Subject: [PATCH] Resolve content links

Use some code to resolve content links to bypass shorteners...
---
 bridges/TwitterBridgeTweaked.php | 69 +++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php
index 3b6ed130..6d7b7b83 100644
--- a/bridges/TwitterBridgeTweaked.php
+++ b/bridges/TwitterBridgeTweaked.php
@@ -1,6 +1,6 @@
 0) ? true : false;
         return $has_tld;
-    }
+    }
     private function cleaner($url) {
         $U = explode(' ', $url);
         $W =array();
         foreach ($U as $k => $u) {
-            if (stristr($u,".")) { //only preg_match if there is a dot
+            if (stristr($u,".")) { //only preg_match if there is a dot
                 if ($this->containsTLD($u) === true) {
                     unset($U[$k]);
                     return $this->cleaner( implode(' ', $U) );
-                }
+                }
             }
         }
         return implode(' ', $U);
     }
+    // (c) Kraoc / urlclean
+    // https://github.com/kraoc/Leed-market/blob/master/urlclean/urlclean.plugin.disabled.php
+    private function resolve_url($link) {
+        // follow redirects with cURL to find the real URL (slowest method, and the request itself leaks to every hop)
+        if (function_exists('curl_init') && !ini_get('safe_mode')) {
+            $ch = curl_init(); // the handle must exist before any curl_setopt() call below
+            $ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.16 (KHTML, like Gecko) Chrome/24.0.1304.0 Safari/537.16';
+            curl_setopt($ch, CURLOPT_USERAGENT, $ua);
+            curl_setopt($ch, CURLOPT_URL, $link);
+            curl_setopt($ch, CURLOPT_HEADER, true);
+            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
+            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+            // >>> anonymization
+            curl_setopt($ch, CURLOPT_COOKIESESSION, true);
+            curl_setopt($ch, CURLOPT_REFERER, '');
+            // <<< anonymization
+            curl_exec($ch); // response is discarded; only the effective URL is read below
+            $link = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
+        }
+
+        $link = preg_replace("/[&#?]xtor=(.)+/", "", $link); // remove: xtor
+        $link = preg_replace("/utm_([^&#]|(&))+&*/", "", $link); // remove: utm_ (the pattern runs up to the next '#', so parameters following utm_* are dropped too)
+
+        // cleanup end of url
+        $link = preg_replace("/\?&/", "", $link);
+        if (isset($link[strlen($link) -1])){
+            if ($link[strlen($link) -1] == '?')
+                $link = substr($link, 0, strlen($link) -1);
+        }
+
+        return $link;
+    }
+
     public function collectData(array $param){
-        $html = '';
+        $html = '';
         if (isset($param['q'])) { /* keyword search mode */
             $html = file_get_html('https://twitter.com/search/realtime?q='.urlencode($param['q']).'+include:retweets&src=typd') or $this->returnError('No results for this query.', 404);
         }
@@ -54,18 +87,18 @@ class TwitterBridgeTweaked extends BridgeAbstract{
             // extract username and sanitize
             $item->username = $tweet->getAttribute('data-screen-name');
             // extract fullname (pseudonym)
-            $item->fullname = $tweet->getAttribute('data-name');
+            $item->fullname = $tweet->getAttribute('data-name');
             // get avatar link
-            $item->avatar = $tweet->find('img', 0)->src;
+            $item->avatar = $tweet->find('img', 0)->src;
             // get TweetID
             $item->id = $tweet->getAttribute('data-tweet-id');
-            // get tweet link
-            $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href');
+            // get tweet link
+            $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href');
             // extract tweet timestamp
             $item->timestamp = $tweet->find('span.js-short-timestamp', 0)->getAttribute('data-time');
-            // extract plaintext
-            $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, '')));
-
+            // extract plaintext
+            $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, '')));
+
             // processing content links
             foreach($tweet->find('a') as $link) {
                 if($link->hasAttribute('data-expanded-url') ) {
@@ -88,6 +121,14 @@ class TwitterBridgeTweaked extends BridgeAbstract{
             $item->title = preg_replace('|www\.[a-z\.0-9]+|i', '', $item->title); // remove www. links
             $item->title = $this->cleaner($item->title); // remove all remaining links
             $item->title = trim($item->title); // remove extra spaces at beginning and end
+
+            // convert all content links to real ones
+            $regex = "/(http|https|ftp|ftps)\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(\/\S*)?/";
+            $item->content = preg_replace_callback($regex, function($url) {
+                // do stuff with $url[0] here
+                return $this->resolve_url($url[0]);
+            }, $item->content);
+
+            // put out
             $this->items[] = $item;
         }
@@ -104,7 +145,7 @@ class TwitterBridgeTweaked extends BridgeAbstract{
     public function getCacheDuration(){
         return 300; // 5 minutes
     }
-
+
     public function getUsername(){
         return $this->items[0]->username;
     }