Merge pull request #624 from julienCXX/pr-curl-http-fetch
Added (and set as default) a cURL-based method for fetching HTTP content
This commit is contained in:
commit
d0d3623172
1 changed file with 150 additions and 10 deletions
|
@ -1,6 +1,7 @@
|
||||||
<?php
|
<?php
|
||||||
/**
|
/**
|
||||||
* GET an HTTP URL to retrieve its content
|
* GET an HTTP URL to retrieve its content
|
||||||
|
* Uses the cURL library or a fallback method
|
||||||
*
|
*
|
||||||
* @param string $url URL to get (http://...)
|
* @param string $url URL to get (http://...)
|
||||||
* @param int $timeout network timeout (in seconds)
|
* @param int $timeout network timeout (in seconds)
|
||||||
|
@ -20,9 +21,13 @@
|
||||||
* echo 'There was an error: '.htmlspecialchars($headers[0]);
|
* echo 'There was an error: '.htmlspecialchars($headers[0]);
|
||||||
* }
|
* }
|
||||||
*
|
*
|
||||||
* @see http://php.net/manual/en/function.file-get-contents.php
|
* @see https://secure.php.net/manual/en/ref.curl.php
|
||||||
* @see http://php.net/manual/en/function.stream-context-create.php
|
* @see https://secure.php.net/manual/en/functions.anonymous.php
|
||||||
* @see http://php.net/manual/en/function.get-headers.php
|
* @see https://secure.php.net/manual/en/function.preg-split.php
|
||||||
|
* @see https://secure.php.net/manual/en/function.explode.php
|
||||||
|
* @see http://stackoverflow.com/q/17641073
|
||||||
|
* @see http://stackoverflow.com/q/9183178
|
||||||
|
* @see http://stackoverflow.com/q/1462720
|
||||||
*/
|
*/
|
||||||
function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
|
function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
|
||||||
{
|
{
|
||||||
|
@ -33,25 +38,160 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
|
||||||
return array(array(0 => 'Invalid HTTP Url'), false);
|
return array(array(0 => 'Invalid HTTP Url'), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$userAgent =
|
||||||
|
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
|
||||||
|
. ' Gecko/20100101 Firefox/45.0';
|
||||||
|
$acceptLanguage =
|
||||||
|
substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
|
||||||
|
$maxRedirs = 3;
|
||||||
|
|
||||||
|
if (!function_exists('curl_init')) {
|
||||||
|
return get_http_response_fallback(
|
||||||
|
$cleanUrl,
|
||||||
|
$timeout,
|
||||||
|
$maxBytes,
|
||||||
|
$userAgent,
|
||||||
|
$acceptLanguage,
|
||||||
|
$maxRedirs
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
$ch = curl_init($cleanUrl);
|
||||||
|
if ($ch === false) {
|
||||||
|
return array(array(0 => 'curl_init() error'), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// General cURL settings
|
||||||
|
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
|
||||||
|
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
|
||||||
|
curl_setopt($ch, CURLOPT_HEADER, true);
|
||||||
|
curl_setopt(
|
||||||
|
$ch,
|
||||||
|
CURLOPT_HTTPHEADER,
|
||||||
|
array('Accept-Language: ' . $acceptLanguage)
|
||||||
|
);
|
||||||
|
curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
|
||||||
|
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
||||||
|
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
|
||||||
|
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
|
||||||
|
|
||||||
|
// Max download size management
|
||||||
|
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);
|
||||||
|
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
|
||||||
|
curl_setopt($ch, CURLOPT_PROGRESSFUNCTION,
|
||||||
|
function($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes)
|
||||||
|
{
|
||||||
|
if (version_compare(phpversion(), '5.5', '<')) {
|
||||||
|
// PHP version lower than 5.5
|
||||||
|
// Callback has 4 arguments
|
||||||
|
$downloaded = $arg1;
|
||||||
|
} else {
|
||||||
|
// Callback has 5 arguments
|
||||||
|
$downloaded = $arg2;
|
||||||
|
}
|
||||||
|
// Non-zero return stops downloading
|
||||||
|
return ($downloaded > $maxBytes) ? 1 : 0;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
$response = curl_exec($ch);
|
||||||
|
$errorNo = curl_errno($ch);
|
||||||
|
$errorStr = curl_error($ch);
|
||||||
|
$headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
|
||||||
|
curl_close($ch);
|
||||||
|
|
||||||
|
if ($response === false) {
|
||||||
|
if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
|
||||||
|
/*
|
||||||
|
* Workaround to match fallback method behaviour
|
||||||
|
* Removing this would require updating
|
||||||
|
* GetHttpUrlTest::testGetInvalidRemoteUrl()
|
||||||
|
*/
|
||||||
|
return array(false, false);
|
||||||
|
}
|
||||||
|
return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Formatting output like the fallback method
|
||||||
|
$rawHeaders = substr($response, 0, $headSize);
|
||||||
|
|
||||||
|
// Keep only headers from latest redirection
|
||||||
|
$rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
|
||||||
|
$rawHeadersLastRedir = end($rawHeadersArrayRedirs);
|
||||||
|
|
||||||
|
$content = substr($response, $headSize);
|
||||||
|
$headers = array();
|
||||||
|
foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
|
||||||
|
if (empty($line) or ctype_space($line)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
$splitLine = explode(': ', $line, 2);
|
||||||
|
if (count($splitLine) > 1) {
|
||||||
|
$key = $splitLine[0];
|
||||||
|
$value = $splitLine[1];
|
||||||
|
if (array_key_exists($key, $headers)) {
|
||||||
|
if (!is_array($headers[$key])) {
|
||||||
|
$headers[$key] = array(0 => $headers[$key]);
|
||||||
|
}
|
||||||
|
$headers[$key][] = $value;
|
||||||
|
} else {
|
||||||
|
$headers[$key] = $value;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
$headers[] = $splitLine[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return array($headers, $content);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET an HTTP URL to retrieve its content (fallback method)
|
||||||
|
*
|
||||||
|
* @param string $cleanUrl URL to get (http://... valid and in ASCII form)
|
||||||
|
* @param int $timeout network timeout (in seconds)
|
||||||
|
* @param int $maxBytes maximum downloaded bytes
|
||||||
|
* @param string $userAgent "User-Agent" header
|
||||||
|
* @param string $acceptLanguage "Accept-Language" header
|
||||||
|
* @param int $maxRedr maximum number of redirections to follow
|
||||||
|
*
|
||||||
|
* @return array HTTP response headers, downloaded content
|
||||||
|
*
|
||||||
|
* Output format:
|
||||||
|
* [0] = associative array containing HTTP response headers
|
||||||
|
* [1] = URL content (downloaded data)
|
||||||
|
*
|
||||||
|
* @see http://php.net/manual/en/function.file-get-contents.php
|
||||||
|
* @see http://php.net/manual/en/function.stream-context-create.php
|
||||||
|
* @see http://php.net/manual/en/function.get-headers.php
|
||||||
|
*/
|
||||||
|
function get_http_response_fallback(
|
||||||
|
$cleanUrl,
|
||||||
|
$timeout,
|
||||||
|
$maxBytes,
|
||||||
|
$userAgent,
|
||||||
|
$acceptLanguage,
|
||||||
|
$maxRedr
|
||||||
|
) {
|
||||||
$options = array(
|
$options = array(
|
||||||
'http' => array(
|
'http' => array(
|
||||||
'method' => 'GET',
|
'method' => 'GET',
|
||||||
'timeout' => $timeout,
|
'timeout' => $timeout,
|
||||||
'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
|
'user_agent' => $userAgent,
|
||||||
.' Gecko/20100101 Firefox/45.0',
|
'header' => "Accept: */*\r\n"
|
||||||
'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3',
|
. 'Accept-Language: ' . $acceptLanguage
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
stream_context_set_default($options);
|
stream_context_set_default($options);
|
||||||
list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
|
list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
|
||||||
if (! $headers || strpos($headers[0], '200 OK') === false) {
|
if (! $headers || strpos($headers[0], '200 OK') === false) {
|
||||||
$options['http']['request_fulluri'] = true;
|
$options['http']['request_fulluri'] = true;
|
||||||
stream_context_set_default($options);
|
stream_context_set_default($options);
|
||||||
list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
|
list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (! $headers || strpos($headers[0], '200 OK') === false) {
|
if (! $headers) {
|
||||||
return array($headers, false);
|
return array($headers, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue