Improve metadata retrieval (performance and accuracy)
- Use a dedicated function to download headers, to avoid applying multiple regexps to the headers - Also try to extract the title from meta tags
This commit is contained in:
parent
4cf3564d28
commit
5334090be0
5 changed files with 249 additions and 189 deletions
|
@ -14,9 +14,14 @@
|
||||||
*/
|
*/
|
||||||
class HttpAccess
|
class HttpAccess
|
||||||
{
|
{
|
||||||
public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
|
public function getHttpResponse(
|
||||||
{
|
$url,
|
||||||
return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction);
|
$timeout = 30,
|
||||||
|
$maxBytes = 4194304,
|
||||||
|
$curlHeaderFunction = null,
|
||||||
|
$curlWriteFunction = null
|
||||||
|
) {
|
||||||
|
return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getCurlDownloadCallback(
|
public function getCurlDownloadCallback(
|
||||||
|
@ -24,16 +29,19 @@ public function getCurlDownloadCallback(
|
||||||
&$title,
|
&$title,
|
||||||
&$description,
|
&$description,
|
||||||
&$keywords,
|
&$keywords,
|
||||||
$retrieveDescription,
|
$retrieveDescription
|
||||||
$curlGetInfo = 'curl_getinfo'
|
|
||||||
) {
|
) {
|
||||||
return get_curl_download_callback(
|
return get_curl_download_callback(
|
||||||
$charset,
|
$charset,
|
||||||
$title,
|
$title,
|
||||||
$description,
|
$description,
|
||||||
$keywords,
|
$keywords,
|
||||||
$retrieveDescription,
|
$retrieveDescription
|
||||||
$curlGetInfo
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
|
||||||
|
{
|
||||||
|
return get_curl_header_callback($charset, $curlGetInfo);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,12 +6,14 @@
|
||||||
* GET an HTTP URL to retrieve its content
|
* GET an HTTP URL to retrieve its content
|
||||||
* Uses the cURL library or a fallback method
|
* Uses the cURL library or a fallback method
|
||||||
*
|
*
|
||||||
* @param string $url URL to get (http://...)
|
* @param string $url URL to get (http://...)
|
||||||
* @param int $timeout network timeout (in seconds)
|
* @param int $timeout network timeout (in seconds)
|
||||||
* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
|
* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
|
||||||
* @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
|
* @param callable|string $curlHeaderFunction Optional callback called during the download of headers
|
||||||
* Can be used to add download conditions on the
|
* (CURLOPT_HEADERFUNCTION)
|
||||||
* headers (response code, content type, etc.).
|
* @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
|
||||||
|
* Can be used to add download conditions on the
|
||||||
|
* headers (response code, content type, etc.).
|
||||||
*
|
*
|
||||||
* @return array HTTP response headers, downloaded content
|
* @return array HTTP response headers, downloaded content
|
||||||
*
|
*
|
||||||
|
@ -35,8 +37,13 @@
|
||||||
* @see http://stackoverflow.com/q/9183178
|
* @see http://stackoverflow.com/q/9183178
|
||||||
* @see http://stackoverflow.com/q/1462720
|
* @see http://stackoverflow.com/q/1462720
|
||||||
*/
|
*/
|
||||||
function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
|
function get_http_response(
|
||||||
{
|
$url,
|
||||||
|
$timeout = 30,
|
||||||
|
$maxBytes = 4194304,
|
||||||
|
$curlHeaderFunction = null,
|
||||||
|
$curlWriteFunction = null
|
||||||
|
) {
|
||||||
$urlObj = new Url($url);
|
$urlObj = new Url($url);
|
||||||
$cleanUrl = $urlObj->idnToAscii();
|
$cleanUrl = $urlObj->idnToAscii();
|
||||||
|
|
||||||
|
@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
|
||||||
// General cURL settings
|
// General cURL settings
|
||||||
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
|
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
|
||||||
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
|
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
|
||||||
curl_setopt($ch, CURLOPT_HEADER, true);
|
// Default header download if the $curlHeaderFunction is not defined
|
||||||
|
curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
|
||||||
curl_setopt(
|
curl_setopt(
|
||||||
$ch,
|
$ch,
|
||||||
CURLOPT_HTTPHEADER,
|
CURLOPT_HTTPHEADER,
|
||||||
|
@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
|
||||||
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
|
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
|
||||||
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
|
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
|
||||||
|
|
||||||
if (is_callable($curlWriteFunction)) {
|
|
||||||
curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Max download size management
|
// Max download size management
|
||||||
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
|
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
|
||||||
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
|
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
|
||||||
|
if (is_callable($curlHeaderFunction)) {
|
||||||
|
curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
|
||||||
|
}
|
||||||
|
if (is_callable($curlWriteFunction)) {
|
||||||
|
curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
|
||||||
|
}
|
||||||
curl_setopt(
|
curl_setopt(
|
||||||
$ch,
|
$ch,
|
||||||
CURLOPT_PROGRESSFUNCTION,
|
CURLOPT_PROGRESSFUNCTION,
|
||||||
function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
|
function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
|
||||||
if (version_compare(phpversion(), '5.5', '<')) {
|
$downloaded = $arg2;
|
||||||
// PHP version lower than 5.5
|
|
||||||
// Callback has 4 arguments
|
|
||||||
$downloaded = $arg1;
|
|
||||||
} else {
|
|
||||||
// Callback has 5 arguments
|
|
||||||
$downloaded = $arg2;
|
|
||||||
}
|
|
||||||
// Non-zero return stops downloading
|
// Non-zero return stops downloading
|
||||||
return ($downloaded > $maxBytes) ? 1 : 0;
|
return ($downloaded > $maxBytes) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
@ -493,53 +497,22 @@ function is_https($server)
|
||||||
* Get cURL callback function for CURLOPT_WRITEFUNCTION
|
* Get cURL callback function for CURLOPT_HEADERFUNCTION
|
||||||
*
|
*
|
||||||
* @param string $charset to extract from the downloaded page (reference)
|
* @param string $charset to extract from the downloaded page (reference)
|
||||||
* @param string $title to extract from the downloaded page (reference)
|
|
||||||
* @param string $description to extract from the downloaded page (reference)
|
|
||||||
* @param string $keywords to extract from the downloaded page (reference)
|
|
||||||
* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
|
|
||||||
* @param string $curlGetInfo Optionally overrides curl_getinfo function
|
* @param string $curlGetInfo Optionally overrides curl_getinfo function
|
||||||
*
|
*
|
||||||
* @return Closure
|
* @return Closure
|
||||||
*/
|
*/
|
||||||
function get_curl_download_callback(
|
function get_curl_header_callback(
|
||||||
&$charset,
|
&$charset,
|
||||||
&$title,
|
|
||||||
&$description,
|
|
||||||
&$keywords,
|
|
||||||
$retrieveDescription,
|
|
||||||
$curlGetInfo = 'curl_getinfo'
|
$curlGetInfo = 'curl_getinfo'
|
||||||
) {
|
) {
|
||||||
$isRedirected = false;
|
$isRedirected = false;
|
||||||
$currentChunk = 0;
|
|
||||||
$foundChunk = null;
|
|
||||||
|
|
||||||
/**
|
return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
|
||||||
* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
|
|
||||||
*
|
|
||||||
* While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
|
|
||||||
* Then we extract the title and the charset and stop the download when it's done.
|
|
||||||
*
|
|
||||||
* @param resource $ch cURL resource
|
|
||||||
* @param string $data chunk of data being downloaded
|
|
||||||
*
|
|
||||||
* @return int|bool length of $data or false if we need to stop the download
|
|
||||||
*/
|
|
||||||
return function (&$ch, $data) use (
|
|
||||||
$retrieveDescription,
|
|
||||||
$curlGetInfo,
|
|
||||||
&$charset,
|
|
||||||
&$title,
|
|
||||||
&$description,
|
|
||||||
&$keywords,
|
|
||||||
&$isRedirected,
|
|
||||||
&$currentChunk,
|
|
||||||
&$foundChunk
|
|
||||||
) {
|
|
||||||
$currentChunk++;
|
|
||||||
$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
|
$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
|
||||||
|
$chunkLength = strlen($data);
|
||||||
if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
|
if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
|
||||||
$isRedirected = true;
|
$isRedirected = true;
|
||||||
return strlen($data);
|
return $chunkLength;
|
||||||
}
|
}
|
||||||
if (!empty($responseCode) && $responseCode !== 200) {
|
if (!empty($responseCode) && $responseCode !== 200) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -555,6 +528,56 @@ function get_curl_download_callback(
|
||||||
if (!empty($contentType) && empty($charset)) {
|
if (!empty($contentType) && empty($charset)) {
|
||||||
$charset = header_extract_charset($contentType);
|
$charset = header_extract_charset($contentType);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return $chunkLength;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get cURL callback function for CURLOPT_WRITEFUNCTION
|
||||||
|
*
|
||||||
|
* @param string $charset to extract from the downloaded page (reference)
|
||||||
|
* @param string $title to extract from the downloaded page (reference)
|
||||||
|
* @param string $description to extract from the downloaded page (reference)
|
||||||
|
* @param string $keywords to extract from the downloaded page (reference)
|
||||||
|
* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
|
||||||
|
* @param string $curlGetInfo Optionally overrides curl_getinfo function
|
||||||
|
*
|
||||||
|
* @return Closure
|
||||||
|
*/
|
||||||
|
function get_curl_download_callback(
|
||||||
|
&$charset,
|
||||||
|
&$title,
|
||||||
|
&$description,
|
||||||
|
&$keywords,
|
||||||
|
$retrieveDescription
|
||||||
|
) {
|
||||||
|
$currentChunk = 0;
|
||||||
|
$foundChunk = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
|
||||||
|
*
|
||||||
|
* While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
|
||||||
|
* Then we extract the title and the charset and stop the download when it's done.
|
||||||
|
*
|
||||||
|
* @param resource $ch cURL resource
|
||||||
|
* @param string $data chunk of data being downloaded
|
||||||
|
*
|
||||||
|
* @return int|bool length of $data or false if we need to stop the download
|
||||||
|
*/
|
||||||
|
return function ($ch, $data) use (
|
||||||
|
$retrieveDescription,
|
||||||
|
&$charset,
|
||||||
|
&$title,
|
||||||
|
&$description,
|
||||||
|
&$keywords,
|
||||||
|
&$currentChunk,
|
||||||
|
&$foundChunk
|
||||||
|
) {
|
||||||
|
$chunkLength = strlen($data);
|
||||||
|
$currentChunk++;
|
||||||
|
|
||||||
if (empty($charset)) {
|
if (empty($charset)) {
|
||||||
$charset = html_extract_charset($data);
|
$charset = html_extract_charset($data);
|
||||||
}
|
}
|
||||||
|
@ -562,6 +585,10 @@ function get_curl_download_callback(
|
||||||
$title = html_extract_title($data);
|
$title = html_extract_title($data);
|
||||||
$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
|
$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
|
||||||
}
|
}
|
||||||
|
if (empty($title)) {
|
||||||
|
$title = html_extract_tag('title', $data);
|
||||||
|
$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
|
||||||
|
}
|
||||||
if ($retrieveDescription && empty($description)) {
|
if ($retrieveDescription && empty($description)) {
|
||||||
$description = html_extract_tag('description', $data);
|
$description = html_extract_tag('description', $data);
|
||||||
$foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
|
$foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
|
||||||
|
@ -591,6 +618,6 @@ function get_curl_download_callback(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return strlen($data);
|
return $chunkLength;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,6 +46,7 @@ public function retrieve(string $url): array
|
||||||
$url,
|
$url,
|
||||||
$this->conf->get('general.download_timeout', 30),
|
$this->conf->get('general.download_timeout', 30),
|
||||||
$this->conf->get('general.download_max_size', 4194304),
|
$this->conf->get('general.download_max_size', 4194304),
|
||||||
|
$this->httpAccess->getCurlHeaderCallback($charset),
|
||||||
$this->httpAccess->getCurlDownloadCallback(
|
$this->httpAccess->getCurlDownloadCallback(
|
||||||
$charset,
|
$charset,
|
||||||
$title,
|
$title,
|
||||||
|
|
|
@ -216,60 +216,91 @@ public function testHtmlExtractNonExistentOgTag()
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the download callback with valid value
|
* Test the header callback with valid value
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackOk()
|
public function testCurlHeaderCallbackOk(): void
|
||||||
{
|
{
|
||||||
$callback = get_curl_download_callback(
|
$callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ok');
|
||||||
$charset,
|
|
||||||
$title,
|
|
||||||
$desc,
|
|
||||||
$keywords,
|
|
||||||
false,
|
|
||||||
'ut_curl_getinfo_ok'
|
|
||||||
);
|
|
||||||
$data = [
|
$data = [
|
||||||
'HTTP/1.1 200 OK',
|
'HTTP/1.1 200 OK',
|
||||||
'Server: GitHub.com',
|
'Server: GitHub.com',
|
||||||
'Date: Sat, 28 Oct 2017 12:01:33 GMT',
|
'Date: Sat, 28 Oct 2017 12:01:33 GMT',
|
||||||
'Content-Type: text/html; charset=utf-8',
|
'Content-Type: text/html; charset=utf-8',
|
||||||
'Status: 200 OK',
|
'Status: 200 OK',
|
||||||
'end' => 'th=device-width">'
|
];
|
||||||
|
|
||||||
|
foreach ($data as $chunk) {
|
||||||
|
static::assertIsInt($callback(null, $chunk));
|
||||||
|
}
|
||||||
|
|
||||||
|
static::assertSame('utf-8', $charset);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with valid value
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackOk(): void
|
||||||
|
{
|
||||||
|
$charset = 'utf-8';
|
||||||
|
$callback = get_curl_download_callback(
|
||||||
|
$charset,
|
||||||
|
$title,
|
||||||
|
$desc,
|
||||||
|
$keywords,
|
||||||
|
false
|
||||||
|
);
|
||||||
|
|
||||||
|
$data = [
|
||||||
|
'th=device-width">'
|
||||||
. '<title>Refactoring · GitHub</title>'
|
. '<title>Refactoring · GitHub</title>'
|
||||||
. '<link rel="search" type="application/opensea',
|
. '<link rel="search" type="application/opensea',
|
||||||
'<title>ignored</title>'
|
'<title>ignored</title>'
|
||||||
. '<meta name="description" content="desc" />'
|
. '<meta name="description" content="desc" />'
|
||||||
. '<meta name="keywords" content="key1,key2" />',
|
. '<meta name="keywords" content="key1,key2" />',
|
||||||
];
|
];
|
||||||
foreach ($data as $key => $line) {
|
|
||||||
$ignore = null;
|
foreach ($data as $chunk) {
|
||||||
$expected = $key !== 'end' ? strlen($line) : false;
|
static::assertSame(strlen($chunk), $callback(null, $chunk));
|
||||||
$this->assertEquals($expected, $callback($ignore, $line));
|
|
||||||
if ($expected === false) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
$this->assertEquals('utf-8', $charset);
|
|
||||||
$this->assertEquals('Refactoring · GitHub', $title);
|
static::assertSame('utf-8', $charset);
|
||||||
$this->assertEmpty($desc);
|
static::assertSame('Refactoring · GitHub', $title);
|
||||||
$this->assertEmpty($keywords);
|
static::assertEmpty($desc);
|
||||||
|
static::assertEmpty($keywords);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the header callback with valid values and no charset
|
||||||
|
*/
|
||||||
|
public function testCurlHeaderCallbackNoCharset(): void
|
||||||
|
{
|
||||||
|
$callback = get_curl_header_callback($charset, 'ut_curl_getinfo_no_charset');
|
||||||
|
$data = [
|
||||||
|
'HTTP/1.1 200 OK',
|
||||||
|
];
|
||||||
|
|
||||||
|
foreach ($data as $chunk) {
|
||||||
|
static::assertSame(strlen($chunk), $callback(null, $chunk));
|
||||||
|
}
|
||||||
|
|
||||||
|
static::assertFalse($charset);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the download callback with valid values and no charset
|
* Test the download callback with valid values and no charset
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackOkNoCharset()
|
public function testCurlDownloadCallbackOkNoCharset(): void
|
||||||
{
|
{
|
||||||
|
$charset = null;
|
||||||
$callback = get_curl_download_callback(
|
$callback = get_curl_download_callback(
|
||||||
$charset,
|
$charset,
|
||||||
$title,
|
$title,
|
||||||
$desc,
|
$desc,
|
||||||
$keywords,
|
$keywords,
|
||||||
false,
|
false
|
||||||
'ut_curl_getinfo_no_charset'
|
|
||||||
);
|
);
|
||||||
|
|
||||||
$data = [
|
$data = [
|
||||||
'HTTP/1.1 200 OK',
|
|
||||||
'end' => 'th=device-width">'
|
'end' => 'th=device-width">'
|
||||||
. '<title>Refactoring · GitHub</title>'
|
. '<title>Refactoring · GitHub</title>'
|
||||||
. '<link rel="search" type="application/opensea',
|
. '<link rel="search" type="application/opensea',
|
||||||
|
@ -277,10 +308,11 @@ public function testCurlDownloadCallbackOkNoCharset()
|
||||||
. '<meta name="description" content="desc" />'
|
. '<meta name="description" content="desc" />'
|
||||||
. '<meta name="keywords" content="key1,key2" />',
|
. '<meta name="keywords" content="key1,key2" />',
|
||||||
];
|
];
|
||||||
foreach ($data as $key => $line) {
|
|
||||||
$ignore = null;
|
foreach ($data as $chunk) {
|
||||||
$this->assertEquals(strlen($line), $callback($ignore, $line));
|
static::assertSame(strlen($chunk), $callback(null, $chunk));
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->assertEmpty($charset);
|
$this->assertEmpty($charset);
|
||||||
$this->assertEquals('Refactoring · GitHub', $title);
|
$this->assertEquals('Refactoring · GitHub', $title);
|
||||||
$this->assertEmpty($desc);
|
$this->assertEmpty($desc);
|
||||||
|
@ -290,18 +322,18 @@ public function testCurlDownloadCallbackOkNoCharset()
|
||||||
/**
|
/**
|
||||||
* Test the download callback with valid values and no charset
|
* Test the download callback with valid values and no charset
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackOkHtmlCharset()
|
public function testCurlDownloadCallbackOkHtmlCharset(): void
|
||||||
{
|
{
|
||||||
|
$charset = null;
|
||||||
$callback = get_curl_download_callback(
|
$callback = get_curl_download_callback(
|
||||||
$charset,
|
$charset,
|
||||||
$title,
|
$title,
|
||||||
$desc,
|
$desc,
|
||||||
$keywords,
|
$keywords,
|
||||||
false,
|
false
|
||||||
'ut_curl_getinfo_no_charset'
|
|
||||||
);
|
);
|
||||||
|
|
||||||
$data = [
|
$data = [
|
||||||
'HTTP/1.1 200 OK',
|
|
||||||
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
|
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
|
||||||
'end' => 'th=device-width">'
|
'end' => 'th=device-width">'
|
||||||
. '<title>Refactoring · GitHub</title>'
|
. '<title>Refactoring · GitHub</title>'
|
||||||
|
@ -310,14 +342,10 @@ public function testCurlDownloadCallbackOkHtmlCharset()
|
||||||
. '<meta name="description" content="desc" />'
|
. '<meta name="description" content="desc" />'
|
||||||
. '<meta name="keywords" content="key1,key2" />',
|
. '<meta name="keywords" content="key1,key2" />',
|
||||||
];
|
];
|
||||||
foreach ($data as $key => $line) {
|
foreach ($data as $chunk) {
|
||||||
$ignore = null;
|
static::assertSame(strlen($chunk), $callback(null, $chunk));
|
||||||
$expected = $key !== 'end' ? strlen($line) : false;
|
|
||||||
$this->assertEquals($expected, $callback($ignore, $line));
|
|
||||||
if ($expected === false) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->assertEquals('utf-8', $charset);
|
$this->assertEquals('utf-8', $charset);
|
||||||
$this->assertEquals('Refactoring · GitHub', $title);
|
$this->assertEquals('Refactoring · GitHub', $title);
|
||||||
$this->assertEmpty($desc);
|
$this->assertEmpty($desc);
|
||||||
|
@ -327,25 +355,26 @@ public function testCurlDownloadCallbackOkHtmlCharset()
|
||||||
/**
|
/**
|
||||||
* Test the download callback with valid values and no title
|
* Test the download callback with valid values and no title
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackOkNoTitle()
|
public function testCurlDownloadCallbackOkNoTitle(): void
|
||||||
{
|
{
|
||||||
|
$charset = 'utf-8';
|
||||||
$callback = get_curl_download_callback(
|
$callback = get_curl_download_callback(
|
||||||
$charset,
|
$charset,
|
||||||
$title,
|
$title,
|
||||||
$desc,
|
$desc,
|
||||||
$keywords,
|
$keywords,
|
||||||
false,
|
false
|
||||||
'ut_curl_getinfo_ok'
|
|
||||||
);
|
);
|
||||||
|
|
||||||
$data = [
|
$data = [
|
||||||
'HTTP/1.1 200 OK',
|
|
||||||
'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
|
'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
|
||||||
'ignored',
|
'ignored',
|
||||||
];
|
];
|
||||||
foreach ($data as $key => $line) {
|
|
||||||
$ignore = null;
|
foreach ($data as $chunk) {
|
||||||
$this->assertEquals(strlen($line), $callback($ignore, $line));
|
static::assertSame(strlen($chunk), $callback(null, $chunk));
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->assertEquals('utf-8', $charset);
|
$this->assertEquals('utf-8', $charset);
|
||||||
$this->assertEmpty($title);
|
$this->assertEmpty($title);
|
||||||
$this->assertEmpty($desc);
|
$this->assertEmpty($desc);
|
||||||
|
@ -353,81 +382,55 @@ public function testCurlDownloadCallbackOkNoTitle()
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the download callback with an invalid content type.
|
* Test the header callback with an invalid content type.
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackInvalidContentType()
|
public function testCurlHeaderCallbackInvalidContentType(): void
|
||||||
{
|
{
|
||||||
$callback = get_curl_download_callback(
|
$callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ct_ko');
|
||||||
$charset,
|
$data = [
|
||||||
$title,
|
'HTTP/1.1 200 OK',
|
||||||
$desc,
|
];
|
||||||
$keywords,
|
|
||||||
false,
|
static::assertFalse($callback(null, $data[0]));
|
||||||
'ut_curl_getinfo_ct_ko'
|
static::assertNull($charset);
|
||||||
);
|
|
||||||
$ignore = null;
|
|
||||||
$this->assertFalse($callback($ignore, ''));
|
|
||||||
$this->assertEmpty($charset);
|
|
||||||
$this->assertEmpty($title);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the download callback with an invalid response code.
|
* Test the header callback with an invalid response code.
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackInvalidResponseCode()
|
public function testCurlHeaderCallbackInvalidResponseCode(): void
|
||||||
{
|
{
|
||||||
$callback = $callback = get_curl_download_callback(
|
$callback = get_curl_header_callback($charset, 'ut_curl_getinfo_rc_ko');
|
||||||
$charset,
|
|
||||||
$title,
|
static::assertFalse($callback(null, ''));
|
||||||
$desc,
|
static::assertNull($charset);
|
||||||
$keywords,
|
|
||||||
false,
|
|
||||||
'ut_curl_getinfo_rc_ko'
|
|
||||||
);
|
|
||||||
$ignore = null;
|
|
||||||
$this->assertFalse($callback($ignore, ''));
|
|
||||||
$this->assertEmpty($charset);
|
|
||||||
$this->assertEmpty($title);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the download callback with an invalid content type and response code.
|
* Test the header callback with an invalid content type and response code.
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
|
public function testCurlHeaderCallbackInvalidContentTypeAndResponseCode(): void
|
||||||
{
|
{
|
||||||
$callback = $callback = get_curl_download_callback(
|
$callback = get_curl_header_callback($charset, 'ut_curl_getinfo_rs_ct_ko');
|
||||||
$charset,
|
|
||||||
$title,
|
static::assertFalse($callback(null, ''));
|
||||||
$desc,
|
static::assertNull($charset);
|
||||||
$keywords,
|
|
||||||
false,
|
|
||||||
'ut_curl_getinfo_rs_ct_ko'
|
|
||||||
);
|
|
||||||
$ignore = null;
|
|
||||||
$this->assertFalse($callback($ignore, ''));
|
|
||||||
$this->assertEmpty($charset);
|
|
||||||
$this->assertEmpty($title);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the download callback with valid value, and retrieve_description option enabled.
|
* Test the download callback with valid value, and retrieve_description option enabled.
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackOkWithDesc()
|
public function testCurlDownloadCallbackOkWithDesc(): void
|
||||||
{
|
{
|
||||||
|
$charset = 'utf-8';
|
||||||
$callback = get_curl_download_callback(
|
$callback = get_curl_download_callback(
|
||||||
$charset,
|
$charset,
|
||||||
$title,
|
$title,
|
||||||
$desc,
|
$desc,
|
||||||
$keywords,
|
$keywords,
|
||||||
true,
|
true
|
||||||
'ut_curl_getinfo_ok'
|
|
||||||
);
|
);
|
||||||
$data = [
|
$data = [
|
||||||
'HTTP/1.1 200 OK',
|
|
||||||
'Server: GitHub.com',
|
|
||||||
'Date: Sat, 28 Oct 2017 12:01:33 GMT',
|
|
||||||
'Content-Type: text/html; charset=utf-8',
|
|
||||||
'Status: 200 OK',
|
|
||||||
'th=device-width">'
|
'th=device-width">'
|
||||||
. '<title>Refactoring · GitHub</title>'
|
. '<title>Refactoring · GitHub</title>'
|
||||||
. '<link rel="search" type="application/opensea',
|
. '<link rel="search" type="application/opensea',
|
||||||
|
@ -435,14 +438,11 @@ public function testCurlDownloadCallbackOkWithDesc()
|
||||||
. '<meta name="description" content="link desc" />'
|
. '<meta name="description" content="link desc" />'
|
||||||
. '<meta name="keywords" content="key1,key2" />',
|
. '<meta name="keywords" content="key1,key2" />',
|
||||||
];
|
];
|
||||||
foreach ($data as $key => $line) {
|
|
||||||
$ignore = null;
|
foreach ($data as $chunk) {
|
||||||
$expected = $key !== 'end' ? strlen($line) : false;
|
static::assertSame(strlen($chunk), $callback(null, $chunk));
|
||||||
$this->assertEquals($expected, $callback($ignore, $line));
|
|
||||||
if ($expected === false) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->assertEquals('utf-8', $charset);
|
$this->assertEquals('utf-8', $charset);
|
||||||
$this->assertEquals('Refactoring · GitHub', $title);
|
$this->assertEquals('Refactoring · GitHub', $title);
|
||||||
$this->assertEquals('link desc', $desc);
|
$this->assertEquals('link desc', $desc);
|
||||||
|
@ -453,8 +453,9 @@ public function testCurlDownloadCallbackOkWithDesc()
|
||||||
* Test the download callback with valid value, and retrieve_description option enabled,
|
* Test the download callback with valid value, and retrieve_description option enabled,
|
||||||
* but no desc or keyword defined in the page.
|
* but no desc or keyword defined in the page.
|
||||||
*/
|
*/
|
||||||
public function testCurlDownloadCallbackOkWithDescNotFound()
|
public function testCurlDownloadCallbackOkWithDescNotFound(): void
|
||||||
{
|
{
|
||||||
|
$charset = 'utf-8';
|
||||||
$callback = get_curl_download_callback(
|
$callback = get_curl_download_callback(
|
||||||
$charset,
|
$charset,
|
||||||
$title,
|
$title,
|
||||||
|
@ -464,24 +465,16 @@ public function testCurlDownloadCallbackOkWithDescNotFound()
|
||||||
'ut_curl_getinfo_ok'
|
'ut_curl_getinfo_ok'
|
||||||
);
|
);
|
||||||
$data = [
|
$data = [
|
||||||
'HTTP/1.1 200 OK',
|
|
||||||
'Server: GitHub.com',
|
|
||||||
'Date: Sat, 28 Oct 2017 12:01:33 GMT',
|
|
||||||
'Content-Type: text/html; charset=utf-8',
|
|
||||||
'Status: 200 OK',
|
|
||||||
'th=device-width">'
|
'th=device-width">'
|
||||||
. '<title>Refactoring · GitHub</title>'
|
. '<title>Refactoring · GitHub</title>'
|
||||||
. '<link rel="search" type="application/opensea',
|
. '<link rel="search" type="application/opensea',
|
||||||
'end' => '<title>ignored</title>',
|
'end' => '<title>ignored</title>',
|
||||||
];
|
];
|
||||||
foreach ($data as $key => $line) {
|
|
||||||
$ignore = null;
|
foreach ($data as $chunk) {
|
||||||
$expected = $key !== 'end' ? strlen($line) : false;
|
static::assertSame(strlen($chunk), $callback(null, $chunk));
|
||||||
$this->assertEquals($expected, $callback($ignore, $line));
|
|
||||||
if ($expected === false) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->assertEquals('utf-8', $charset);
|
$this->assertEquals('utf-8', $charset);
|
||||||
$this->assertEquals('Refactoring · GitHub', $title);
|
$this->assertEquals('Refactoring · GitHub', $title);
|
||||||
$this->assertEmpty($desc);
|
$this->assertEmpty($desc);
|
||||||
|
|
|
@ -38,6 +38,7 @@ public function testFullRetrieval(): void
|
||||||
$remoteTitle = 'Remote Title ';
|
$remoteTitle = 'Remote Title ';
|
||||||
$remoteDesc = 'Sometimes the meta description is relevant.';
|
$remoteDesc = 'Sometimes the meta description is relevant.';
|
||||||
$remoteTags = 'abc def';
|
$remoteTags = 'abc def';
|
||||||
|
$remoteCharset = 'utf-8';
|
||||||
|
|
||||||
$expectedResult = [
|
$expectedResult = [
|
||||||
'title' => $remoteTitle,
|
'title' => $remoteTitle,
|
||||||
|
@ -45,11 +46,28 @@ public function testFullRetrieval(): void
|
||||||
'tags' => $remoteTags,
|
'tags' => $remoteTags,
|
||||||
];
|
];
|
||||||
|
|
||||||
|
$this->httpAccess
|
||||||
|
->expects(static::once())
|
||||||
|
->method('getCurlHeaderCallback')
|
||||||
|
->willReturnCallback(
|
||||||
|
function (&$charset) use (
|
||||||
|
$remoteCharset
|
||||||
|
): callable {
|
||||||
|
return function () use (
|
||||||
|
&$charset,
|
||||||
|
$remoteCharset
|
||||||
|
): void {
|
||||||
|
$charset = $remoteCharset;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
)
|
||||||
|
;
|
||||||
$this->httpAccess
|
$this->httpAccess
|
||||||
->expects(static::once())
|
->expects(static::once())
|
||||||
->method('getCurlDownloadCallback')
|
->method('getCurlDownloadCallback')
|
||||||
->willReturnCallback(
|
->willReturnCallback(
|
||||||
function (&$charset, &$title, &$description, &$tags) use (
|
function (&$charset, &$title, &$description, &$tags) use (
|
||||||
|
$remoteCharset,
|
||||||
$remoteTitle,
|
$remoteTitle,
|
||||||
$remoteDesc,
|
$remoteDesc,
|
||||||
$remoteTags
|
$remoteTags
|
||||||
|
@ -59,11 +77,13 @@ function (&$charset, &$title, &$description, &$tags) use (
|
||||||
&$title,
|
&$title,
|
||||||
&$description,
|
&$description,
|
||||||
&$tags,
|
&$tags,
|
||||||
|
$remoteCharset,
|
||||||
$remoteTitle,
|
$remoteTitle,
|
||||||
$remoteDesc,
|
$remoteDesc,
|
||||||
$remoteTags
|
$remoteTags
|
||||||
): void {
|
): void {
|
||||||
$charset = 'ISO-8859-1';
|
static::assertSame($remoteCharset, $charset);
|
||||||
|
|
||||||
$title = $remoteTitle;
|
$title = $remoteTitle;
|
||||||
$description = $remoteDesc;
|
$description = $remoteDesc;
|
||||||
$tags = $remoteTags;
|
$tags = $remoteTags;
|
||||||
|
@ -75,8 +95,9 @@ function (&$charset, &$title, &$description, &$tags) use (
|
||||||
->expects(static::once())
|
->expects(static::once())
|
||||||
->method('getHttpResponse')
|
->method('getHttpResponse')
|
||||||
->with($url, 30, 4194304)
|
->with($url, 30, 4194304)
|
||||||
->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void {
|
->willReturnCallback(function($url, $timeout, $maxBytes, $headerCallback, $dlCallback): void {
|
||||||
$callback();
|
$headerCallback();
|
||||||
|
$dlCallback();
|
||||||
})
|
})
|
||||||
;
|
;
|
||||||
|
|
||||||
|
@ -102,8 +123,17 @@ public function testEmptyRetrieval(): void
|
||||||
->expects(static::once())
|
->expects(static::once())
|
||||||
->method('getCurlDownloadCallback')
|
->method('getCurlDownloadCallback')
|
||||||
->willReturnCallback(
|
->willReturnCallback(
|
||||||
function (&$charset, &$title, &$description, &$tags): callable {
|
function (): callable {
|
||||||
return function () use (&$charset, &$title, &$description, &$tags): void {};
|
return function (): void {};
|
||||||
|
}
|
||||||
|
)
|
||||||
|
;
|
||||||
|
$this->httpAccess
|
||||||
|
->expects(static::once())
|
||||||
|
->method('getCurlHeaderCallback')
|
||||||
|
->willReturnCallback(
|
||||||
|
function (): callable {
|
||||||
|
return function (): void {};
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
;
|
;
|
||||||
|
@ -111,8 +141,9 @@ function (&$charset, &$title, &$description, &$tags): callable {
|
||||||
->expects(static::once())
|
->expects(static::once())
|
||||||
->method('getHttpResponse')
|
->method('getHttpResponse')
|
||||||
->with($url, 30, 4194304)
|
->with($url, 30, 4194304)
|
||||||
->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void {
|
->willReturnCallback(function($url, $timeout, $maxBytes, $headerCallback, $dlCallback): void {
|
||||||
$callback();
|
$headerCallback();
|
||||||
|
$dlCallback();
|
||||||
})
|
})
|
||||||
;
|
;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue