Merge pull request #624 from julienCXX/pr-curl-http-fetch

Added (and set as default) a cURL-based method for fetching HTTP content
This commit is contained in:
Arthur 2016-08-09 13:15:19 +02:00 committed by GitHub
commit d0d3623172

View file

@ -1,6 +1,7 @@
<?php <?php
/** /**
* GET an HTTP URL to retrieve its content * GET an HTTP URL to retrieve its content
* Uses the cURL library or a fallback method
* *
* @param string $url URL to get (http://...) * @param string $url URL to get (http://...)
* @param int $timeout network timeout (in seconds) * @param int $timeout network timeout (in seconds)
@ -20,9 +21,13 @@
* echo 'There was an error: '.htmlspecialchars($headers[0]); * echo 'There was an error: '.htmlspecialchars($headers[0]);
* } * }
* *
* @see http://php.net/manual/en/function.file-get-contents.php * @see https://secure.php.net/manual/en/ref.curl.php
* @see http://php.net/manual/en/function.stream-context-create.php * @see https://secure.php.net/manual/en/functions.anonymous.php
* @see http://php.net/manual/en/function.get-headers.php * @see https://secure.php.net/manual/en/function.preg-split.php
* @see https://secure.php.net/manual/en/function.explode.php
* @see http://stackoverflow.com/q/17641073
* @see http://stackoverflow.com/q/9183178
* @see http://stackoverflow.com/q/1462720
*/ */
function get_http_response($url, $timeout = 30, $maxBytes = 4194304) function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
{ {
@ -33,25 +38,160 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
return array(array(0 => 'Invalid HTTP Url'), false); return array(array(0 => 'Invalid HTTP Url'), false);
} }
$userAgent =
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
. ' Gecko/20100101 Firefox/45.0';
$acceptLanguage =
substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
$maxRedirs = 3;
if (!function_exists('curl_init')) {
return get_http_response_fallback(
$cleanUrl,
$timeout,
$maxBytes,
$userAgent,
$acceptLanguage,
$maxRedirs
);
}
$ch = curl_init($cleanUrl);
if ($ch === false) {
return array(array(0 => 'curl_init() error'), false);
}
// General cURL settings
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt(
$ch,
CURLOPT_HTTPHEADER,
array('Accept-Language: ' . $acceptLanguage)
);
curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
// Max download size management
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
curl_setopt($ch, CURLOPT_PROGRESSFUNCTION,
function($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes)
{
if (version_compare(phpversion(), '5.5', '<')) {
// PHP version lower than 5.5
// Callback has 4 arguments
$downloaded = $arg1;
} else {
// Callback has 5 arguments
$downloaded = $arg2;
}
// Non-zero return stops downloading
return ($downloaded > $maxBytes) ? 1 : 0;
}
);
$response = curl_exec($ch);
$errorNo = curl_errno($ch);
$errorStr = curl_error($ch);
$headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
curl_close($ch);
if ($response === false) {
if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
/*
* Workaround to match fallback method behaviour
* Removing this would require updating
* GetHttpUrlTest::testGetInvalidRemoteUrl()
*/
return array(false, false);
}
return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
}
// Formatting output like the fallback method
$rawHeaders = substr($response, 0, $headSize);
// Keep only headers from latest redirection
$rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
$rawHeadersLastRedir = end($rawHeadersArrayRedirs);
$content = substr($response, $headSize);
$headers = array();
foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
if (empty($line) or ctype_space($line)) {
continue;
}
$splitLine = explode(': ', $line, 2);
if (count($splitLine) > 1) {
$key = $splitLine[0];
$value = $splitLine[1];
if (array_key_exists($key, $headers)) {
if (!is_array($headers[$key])) {
$headers[$key] = array(0 => $headers[$key]);
}
$headers[$key][] = $value;
} else {
$headers[$key] = $value;
}
} else {
$headers[] = $splitLine[0];
}
}
return array($headers, $content);
}
/**
* GET an HTTP URL to retrieve its content (fallback method)
*
* @param string $cleanUrl URL to get (http://... valid and in ASCII form)
* @param int $timeout network timeout (in seconds)
* @param int $maxBytes maximum downloaded bytes
* @param string $userAgent "User-Agent" header
* @param string $acceptLanguage "Accept-Language" header
* @param int $maxRedr maximum amount of redirections followed
*
* @return array HTTP response headers, downloaded content
*
* Output format:
* [0] = associative array containing HTTP response headers
* [1] = URL content (downloaded data)
*
* @see http://php.net/manual/en/function.file-get-contents.php
* @see http://php.net/manual/en/function.stream-context-create.php
* @see http://php.net/manual/en/function.get-headers.php
*/
function get_http_response_fallback(
$cleanUrl,
$timeout,
$maxBytes,
$userAgent,
$acceptLanguage,
$maxRedr
) {
$options = array( $options = array(
'http' => array( 'http' => array(
'method' => 'GET', 'method' => 'GET',
'timeout' => $timeout, 'timeout' => $timeout,
'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)' 'user_agent' => $userAgent,
.' Gecko/20100101 Firefox/45.0', 'header' => "Accept: */*\r\n"
'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3', . 'Accept-Language: ' . $acceptLanguage
) )
); );
stream_context_set_default($options); stream_context_set_default($options);
list($headers, $finalUrl) = get_redirected_headers($cleanUrl); list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
if (! $headers || strpos($headers[0], '200 OK') === false) { if (! $headers || strpos($headers[0], '200 OK') === false) {
$options['http']['request_fulluri'] = true; $options['http']['request_fulluri'] = true;
stream_context_set_default($options); stream_context_set_default($options);
list($headers, $finalUrl) = get_redirected_headers($cleanUrl); list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
} }
if (! $headers || strpos($headers[0], '200 OK') === false) { if (! $headers) {
return array($headers, false); return array($headers, false);
} }