Merge pull request #205 from ORelio/master

Facebook captcha proxy & fix CNET bridge
This commit is contained in:
Mitsu 2016-01-26 21:19:51 +01:00
commit abce3186d4
2 changed files with 104 additions and 37 deletions

View file

@ -3,22 +3,22 @@ class CNETBridge extends BridgeAbstract {
private $topicName = '';
public function loadMetadatas() {
public function loadMetadatas() {
$this->maintainer = "ORelio";
$this->name = "CNET News";
$this->uri = "http://www.cnet.com/";
$this->description = "Returns the newest articles. <br /> You may specify a topic, else all topics are selected.";
$this->update = "2015-09-10";
$this->maintainer = 'ORelio';
$this->name = 'CNET News';
$this->uri = 'http://www.cnet.com/';
$this->description = 'Returns the newest articles. <br /> You may specify a topic found in some section URLs, else all topics are selected.';
$this->update = '2016-01-23';
$this->parameters[] =
'[
{
"name" : "Topic name",
"identifier" : "topic"
}
]';
}
$this->parameters[] =
'[
{
"name" : "Topic name",
"identifier" : "topic"
}
]';
}
public function collectData(array $param) {
@ -39,7 +39,8 @@ class CNETBridge extends BridgeAbstract {
}
function CleanArticle($article_html) {
$article_html = '<p>'.substr($article_html, strpos($article_html, '</script></div><p>') + 18);
$article_html = '<p>'.substr($article_html, strpos($article_html, '<p>') + 3);
$article_html = StripWithDelimiters($article_html, '<span class="credit">', '</span>');
$article_html = StripWithDelimiters($article_html, '<script>', '</script>');
$article_html = StripWithDelimiters($article_html, '<div class="shortcode related-links', '</div>');
$article_html = StripWithDelimiters($article_html, '<a class="clickToEnlarge">', '</a>');
@ -53,27 +54,29 @@ class CNETBridge extends BridgeAbstract {
$html = file_get_html($pageUrl) or $this->returnError('Could not request CNET: '.$pageUrl, 500);
$limit = 0;
foreach($html->find('div.socialSharingSmall') as $element) {
if ($limit < 4) {
foreach($html->find('div.assetBody') as $element) {
if ($limit < 8) {
$article_meta = json_decode(ExtractFromDelimiters($element->outertext, 'data-social-counts-options=\'', '\'>'));
$article_title = $article_meta->title;
$article_uri = $article_meta->url;
$article_title = trim($element->find('h2', 0)->plaintext);
$article_uri = 'http://www.cnet.com'.($element->find('a', 0)->href);
$article_thumbnail = $element->parent()->find('img', 0)->src;
$article_timestamp = strtotime($element->find('time.assetTime', 0)->plaintext);
$article_author = trim($element->find('a[rel=author]', 0)->plaintext);
if (!empty($article_title) && !empty($article_uri) && strpos($article_uri, '/news/') !== false) {
$article_html = file_get_html($article_uri) or $this->returnError('Could not request CNET: '.$article_uri, 500);
$article_timestamp = strtotime(ExtractFromDelimiters($article_html->innertext, '<time itemprop="datePublished" class="', '">'));
$article_thumbnail = $article_html->find('div.originalImage', 0);
if (is_null($article_thumbnail))
$article_thumbnail = $article_html->find('div.originalImage', 0);
if (is_null($article_thumbnail))
$article_thumbnail = $article_html->find('span.imageContainer', 0);
if (!is_null($article_thumbnail))
if (is_object($article_thumbnail))
$article_thumbnail = $article_thumbnail->find('img', 0)->src;
$article_content = trim(CleanArticle(ExtractFromDelimiters($article_html, '</div></div></div><div class="col-8">', '<footer>')));
$article_author = trim($article_html->find('a.author', 0)->plaintext);
$item = new \Item();
$item->uri = $article_uri;

View file

@ -5,7 +5,7 @@ class FacebookBridge extends BridgeAbstract{
$this->maintainer = "teromene";
$this->name = "Facebook";
$this->uri = "http://facebook.com/";
$this->uri = "http://www.facebook.com/";
$this->description = "Input a page title or a profile log. For a profile log, please insert the parameter as follow : myExamplePage/132621766841117";
$this->update = "23/10/2015";
@ -19,7 +19,7 @@ class FacebookBridge extends BridgeAbstract{
]';
}
public function collectData(array $param){
public function collectData(array $param) {
//Extract a string using start and end delimiters
function ExtractFromDelimiters($string, $start, $end) {
@ -35,7 +35,7 @@ class FacebookBridge extends BridgeAbstract{
if (is_array($matches) && count($matches) > 1) {
$link = $matches[1];
if (strpos($link, '/') === 0)
$link = 'https://facebook.com'.$link.'"';
$link = 'https://www.facebook.com'.$link.'"';
if (strpos($link, 'facebook.com/l.php?u=') !== false)
$link = urldecode(ExtractFromDelimiters($link, 'facebook.com/l.php?u=', '&'));
return ' href="'.$link.'"';
@ -75,18 +75,77 @@ class FacebookBridge extends BridgeAbstract{
return $matches[0];
};
$html = '';
$html = null;
if(isset($param['u'])) {
if(!strpos($param['u'], "/")) {
$html = file_get_html('https://facebook.com/'.urlencode($param['u']).'?_fb_noscript=1') or $this->returnError('No results for this query.', 404);
} else {
$html = file_get_html('https://facebook.com/pages/'.$param['u'].'?_fb_noscript=1') or $this->returnError('No results for this query.', 404);
//Handle captcha response sent by the viewer
if (isset($_POST['captcha_response']))
{
if (session_status() == PHP_SESSION_NONE)
session_start();
if (isset($_SESSION['captcha_fields'], $_SESSION['captcha_action']))
{
$captcha_action = $_SESSION['captcha_action'];
$captcha_fields = $_SESSION['captcha_fields'];
$captcha_fields['captcha_response'] = preg_replace("/[^a-zA-Z0-9]+/", "", $_POST['captcha_response']);
$http_options = array(
'http' => array(
'method' => 'POST',
'user_agent'=> ini_get('user_agent'),
'header'=>array("Content-type: application/x-www-form-urlencoded\r\nReferer: $captcha_action\r\nCookie: noscript=1\r\n"),
'content' => http_build_query($captcha_fields),
),
);
$context = stream_context_create($http_options);
$html = file_get_contents($captcha_action, false, $context);
if ($html === FALSE) { $this->returnError('Failed to submit captcha response back to Facebook', 500); }
unset($_SESSION['captcha_fields']);
$html = str_get_html($html);
}
} else {
$this->returnError('You must specify a Facebook username.', 400);
unset($_SESSION['captcha_fields']);
unset($_SESSION['captcha_action']);
}
//Retrieve page contents
if (is_null($html)) {
if (isset($param['u'])) {
if (!strpos($param['u'], "/")) {
$html = file_get_html('https://www.facebook.com/'.urlencode($param['u']).'?_fb_noscript=1') or $this->returnError('No results for this query.', 404);
} else {
$html = file_get_html('https://www.facebook.com/pages/'.$param['u'].'?_fb_noscript=1') or $this->returnError('No results for this query.', 404);
}
} else {
$this->returnError('You must specify a Facebook username.', 400);
}
}
//Handle captcha form?
$captcha = $html->find('div.captcha_interstitial', 0);
if (!is_null($captcha))
{
//Save form for submitting after getting captcha response
if (session_status() == PHP_SESSION_NONE)
session_start();
$captcha_fields = array();
foreach ($captcha->find('input, button') as $input)
$captcha_fields[$input->name] = $input->value;
$_SESSION['captcha_fields'] = $captcha_fields;
$_SESSION['captcha_action'] = 'https://www.facebook.com'.$captcha->find('form', 0)->action;
//Show captcha filling form to the viewer, proxying the captcha image
$img = base64_encode(file_get_contents($captcha->find('img', 0)->src));
header('HTTP/1.1 500 '.Http::getMessageForCode(500));
header('Content-Type: text/html');
die('<form method="post" action="?'.$_SERVER['QUERY_STRING'].'">'
.'<h2>Facebook captcha challenge</h2>'
.'<p>Unfortunately, rss-bridge cannot fetch the requested page.<br />'
.'Facebook wants rss-bridge to resolve the following captcha:</p>'
.'<p><img src="data:image/png;base64,'.$img.'" /></p>'
.'<p><b>Response:</b> <input name="captcha_response" placeholder="please fill in" />'
.'<input type="submit" value="Submit!" /></p>'
.'</form>');
}
//No captcha? We can carry on retrieving page contents :)
$element = $html->find('[id^=PagePostsSectionPagelet-]')[0]->children(0)->children(0);
if(isset($element)) {
@ -144,7 +203,7 @@ class FacebookBridge extends BridgeAbstract{
$thumbnail = $profilePic;
//Build and add final item
$item->uri = 'https://facebook.com'.str_replace('&amp;', '&', $post->find('abbr')[0]->parent()->getAttribute('href'));
$item->uri = 'https://facebook.com'.$post->find('abbr')[0]->parent()->getAttribute('href');
$item->thumbnailUri = $thumbnail;
$item->content = $content;
$item->title = $title;
@ -154,7 +213,12 @@ class FacebookBridge extends BridgeAbstract{
}
}
}
}
/**
 * Hands the request parameters to the parent implementation after removing
 * the 'captcha_response' entry, if present.
 *
 * NOTE(review): presumably this keeps the viewer's captcha answer from being
 * treated as a regular bridge parameter — confirm against parent::setDatas().
 *
 * @param array $param Request parameters; any 'captcha_response' key is dropped before forwarding.
 */
public function setDatas(array $param){
    // unset() silently ignores a missing key, so no prior existence check is needed.
    unset($param['captcha_response']);
    parent::setDatas($param);
}
public function getName() {