Improve metadata retrieval (performance and accuracy)

- Use a dedicated function to download headers, to avoid applying multiple regexps to headers
  - Also try to extract title from meta tags
This commit is contained in:
ArthurHoaro 2020-10-15 11:20:33 +02:00
parent 4cf3564d28
commit 5334090be0
5 changed files with 249 additions and 189 deletions

View file

@ -14,9 +14,14 @@
*/ */
class HttpAccess class HttpAccess
{ {
public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) public function getHttpResponse(
{ $url,
return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); $timeout = 30,
$maxBytes = 4194304,
$curlHeaderFunction = null,
$curlWriteFunction = null
) {
return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);
} }
public function getCurlDownloadCallback( public function getCurlDownloadCallback(
@ -24,16 +29,19 @@ public function getCurlDownloadCallback(
&$title, &$title,
&$description, &$description,
&$keywords, &$keywords,
$retrieveDescription, $retrieveDescription
$curlGetInfo = 'curl_getinfo'
) { ) {
return get_curl_download_callback( return get_curl_download_callback(
$charset, $charset,
$title, $title,
$description, $description,
$keywords, $keywords,
$retrieveDescription, $retrieveDescription
$curlGetInfo
); );
} }
public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
{
return get_curl_header_callback($charset, $curlGetInfo);
}
} }

View file

@ -9,6 +9,8 @@
* @param string $url URL to get (http://...) * @param string $url URL to get (http://...)
* @param int $timeout network timeout (in seconds) * @param int $timeout network timeout (in seconds)
* @param int $maxBytes maximum downloaded bytes (default: 4 MiB) * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
* @param callable|string $curlHeaderFunction Optional callback called during the download of headers
* (CURLOPT_HEADERFUNCTION)
* @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
* Can be used to add download conditions on the * Can be used to add download conditions on the
* headers (response code, content type, etc.). * headers (response code, content type, etc.).
@ -35,8 +37,13 @@
* @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/9183178
* @see http://stackoverflow.com/q/1462720 * @see http://stackoverflow.com/q/1462720
*/ */
function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) function get_http_response(
{ $url,
$timeout = 30,
$maxBytes = 4194304,
$curlHeaderFunction = null,
$curlWriteFunction = null
) {
$urlObj = new Url($url); $urlObj = new Url($url);
$cleanUrl = $urlObj->idnToAscii(); $cleanUrl = $urlObj->idnToAscii();
@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
// General cURL settings // General cURL settings
curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_HEADER, true); // Default header download if the $curlHeaderFunction is not defined
curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
curl_setopt( curl_setopt(
$ch, $ch,
CURLOPT_HTTPHEADER, CURLOPT_HTTPHEADER,
@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
if (is_callable($curlWriteFunction)) {
curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
}
// Max download size management // Max download size management
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
curl_setopt($ch, CURLOPT_NOPROGRESS, false); curl_setopt($ch, CURLOPT_NOPROGRESS, false);
if (is_callable($curlHeaderFunction)) {
curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
}
if (is_callable($curlWriteFunction)) {
curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
}
curl_setopt( curl_setopt(
$ch, $ch,
CURLOPT_PROGRESSFUNCTION, CURLOPT_PROGRESSFUNCTION,
function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
if (version_compare(phpversion(), '5.5', '<')) {
// PHP version lower than 5.5
// Callback has 4 arguments
$downloaded = $arg1;
} else {
// Callback has 5 arguments
$downloaded = $arg2; $downloaded = $arg2;
}
// Non-zero return stops downloading // Non-zero return stops downloading
return ($downloaded > $maxBytes) ? 1 : 0; return ($downloaded > $maxBytes) ? 1 : 0;
} }
@ -493,53 +497,22 @@ function is_https($server)
* Get cURL callback function for CURLOPT_WRITEFUNCTION * Get cURL callback function for CURLOPT_WRITEFUNCTION
* *
* @param string $charset to extract from the downloaded page (reference) * @param string $charset to extract from the downloaded page (reference)
* @param string $title to extract from the downloaded page (reference)
* @param string $description to extract from the downloaded page (reference)
* @param string $keywords to extract from the downloaded page (reference)
* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
* @param string $curlGetInfo Optionally overrides curl_getinfo function * @param string $curlGetInfo Optionally overrides curl_getinfo function
* *
* @return Closure * @return Closure
*/ */
function get_curl_download_callback( function get_curl_header_callback(
&$charset, &$charset,
&$title,
&$description,
&$keywords,
$retrieveDescription,
$curlGetInfo = 'curl_getinfo' $curlGetInfo = 'curl_getinfo'
) { ) {
$isRedirected = false; $isRedirected = false;
$currentChunk = 0;
$foundChunk = null;
/** return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
*
* While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
* Then we extract the title and the charset and stop the download when it's done.
*
* @param resource $ch cURL resource
* @param string $data chunk of data being downloaded
*
* @return int|bool length of $data or false if we need to stop the download
*/
return function (&$ch, $data) use (
$retrieveDescription,
$curlGetInfo,
&$charset,
&$title,
&$description,
&$keywords,
&$isRedirected,
&$currentChunk,
&$foundChunk
) {
$currentChunk++;
$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
$chunkLength = strlen($data);
if (!empty($responseCode) && in_array($responseCode, [301, 302])) { if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
$isRedirected = true; $isRedirected = true;
return strlen($data); return $chunkLength;
} }
if (!empty($responseCode) && $responseCode !== 200) { if (!empty($responseCode) && $responseCode !== 200) {
return false; return false;
@ -555,6 +528,56 @@ function get_curl_download_callback(
if (!empty($contentType) && empty($charset)) { if (!empty($contentType) && empty($charset)) {
$charset = header_extract_charset($contentType); $charset = header_extract_charset($contentType);
} }
return $chunkLength;
};
}
/**
* Get cURL callback function for CURLOPT_WRITEFUNCTION
*
* @param string $charset to extract from the downloaded page (reference)
* @param string $title to extract from the downloaded page (reference)
* @param string $description to extract from the downloaded page (reference)
* @param string $keywords to extract from the downloaded page (reference)
* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
* @param string $curlGetInfo Optionally overrides curl_getinfo function
*
* @return Closure
*/
function get_curl_download_callback(
&$charset,
&$title,
&$description,
&$keywords,
$retrieveDescription
) {
$currentChunk = 0;
$foundChunk = null;
/**
* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
*
* While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
* Then we extract the title and the charset and stop the download when it's done.
*
* @param resource $ch cURL resource
* @param string $data chunk of data being downloaded
*
* @return int|bool length of $data or false if we need to stop the download
*/
return function ($ch, $data) use (
$retrieveDescription,
&$charset,
&$title,
&$description,
&$keywords,
&$currentChunk,
&$foundChunk
) {
$chunkLength = strlen($data);
$currentChunk++;
if (empty($charset)) { if (empty($charset)) {
$charset = html_extract_charset($data); $charset = html_extract_charset($data);
} }
@ -562,6 +585,10 @@ function get_curl_download_callback(
$title = html_extract_title($data); $title = html_extract_title($data);
$foundChunk = ! empty($title) ? $currentChunk : $foundChunk; $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
} }
if (empty($title)) {
$title = html_extract_tag('title', $data);
$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
}
if ($retrieveDescription && empty($description)) { if ($retrieveDescription && empty($description)) {
$description = html_extract_tag('description', $data); $description = html_extract_tag('description', $data);
$foundChunk = ! empty($description) ? $currentChunk : $foundChunk; $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
@ -591,6 +618,6 @@ function get_curl_download_callback(
return false; return false;
} }
return strlen($data); return $chunkLength;
}; };
} }

View file

@ -46,6 +46,7 @@ public function retrieve(string $url): array
$url, $url,
$this->conf->get('general.download_timeout', 30), $this->conf->get('general.download_timeout', 30),
$this->conf->get('general.download_max_size', 4194304), $this->conf->get('general.download_max_size', 4194304),
$this->httpAccess->getCurlHeaderCallback($charset),
$this->httpAccess->getCurlDownloadCallback( $this->httpAccess->getCurlDownloadCallback(
$charset, $charset,
$title, $title,

View file

@ -216,60 +216,91 @@ public function testHtmlExtractNonExistentOgTag()
} }
/** /**
* Test the download callback with valid value * Test the header callback with valid value
*/ */
public function testCurlDownloadCallbackOk() public function testCurlHeaderCallbackOk(): void
{ {
$callback = get_curl_download_callback( $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ok');
$charset,
$title,
$desc,
$keywords,
false,
'ut_curl_getinfo_ok'
);
$data = [ $data = [
'HTTP/1.1 200 OK', 'HTTP/1.1 200 OK',
'Server: GitHub.com', 'Server: GitHub.com',
'Date: Sat, 28 Oct 2017 12:01:33 GMT', 'Date: Sat, 28 Oct 2017 12:01:33 GMT',
'Content-Type: text/html; charset=utf-8', 'Content-Type: text/html; charset=utf-8',
'Status: 200 OK', 'Status: 200 OK',
'end' => 'th=device-width">'
. '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea',
'<title>ignored</title>'
. '<meta name="description" content="desc" />'
. '<meta name="keywords" content="key1,key2" />',
]; ];
foreach ($data as $key => $line) {
$ignore = null; foreach ($data as $chunk) {
$expected = $key !== 'end' ? strlen($line) : false; static::assertIsInt($callback(null, $chunk));
$this->assertEquals($expected, $callback($ignore, $line));
if ($expected === false) {
break;
} }
}
$this->assertEquals('utf-8', $charset); static::assertSame('utf-8', $charset);
$this->assertEquals('Refactoring · GitHub', $title);
$this->assertEmpty($desc);
$this->assertEmpty($keywords);
} }
/** /**
* Test the download callback with valid values and no charset * Test the download callback with valid value
*/ */
public function testCurlDownloadCallbackOkNoCharset() public function testCurlDownloadCallbackOk(): void
{ {
$charset = 'utf-8';
$callback = get_curl_download_callback( $callback = get_curl_download_callback(
$charset, $charset,
$title, $title,
$desc, $desc,
$keywords, $keywords,
false, false
'ut_curl_getinfo_no_charset'
); );
$data = [
'th=device-width">'
. '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea',
'<title>ignored</title>'
. '<meta name="description" content="desc" />'
. '<meta name="keywords" content="key1,key2" />',
];
foreach ($data as $chunk) {
static::assertSame(strlen($chunk), $callback(null, $chunk));
}
static::assertSame('utf-8', $charset);
static::assertSame('Refactoring · GitHub', $title);
static::assertEmpty($desc);
static::assertEmpty($keywords);
}
/**
* Test the header callback with valid value
*/
public function testCurlHeaderCallbackNoCharset(): void
{
$callback = get_curl_header_callback($charset, 'ut_curl_getinfo_no_charset');
$data = [ $data = [
'HTTP/1.1 200 OK', 'HTTP/1.1 200 OK',
];
foreach ($data as $chunk) {
static::assertSame(strlen($chunk), $callback(null, $chunk));
}
static::assertFalse($charset);
}
/**
* Test the download callback with valid values and no charset
*/
public function testCurlDownloadCallbackOkNoCharset(): void
{
$charset = null;
$callback = get_curl_download_callback(
$charset,
$title,
$desc,
$keywords,
false
);
$data = [
'end' => 'th=device-width">' 'end' => 'th=device-width">'
. '<title>Refactoring · GitHub</title>' . '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea', . '<link rel="search" type="application/opensea',
@ -277,10 +308,11 @@ public function testCurlDownloadCallbackOkNoCharset()
. '<meta name="description" content="desc" />' . '<meta name="description" content="desc" />'
. '<meta name="keywords" content="key1,key2" />', . '<meta name="keywords" content="key1,key2" />',
]; ];
foreach ($data as $key => $line) {
$ignore = null; foreach ($data as $chunk) {
$this->assertEquals(strlen($line), $callback($ignore, $line)); static::assertSame(strlen($chunk), $callback(null, $chunk));
} }
$this->assertEmpty($charset); $this->assertEmpty($charset);
$this->assertEquals('Refactoring · GitHub', $title); $this->assertEquals('Refactoring · GitHub', $title);
$this->assertEmpty($desc); $this->assertEmpty($desc);
@ -290,18 +322,18 @@ public function testCurlDownloadCallbackOkNoCharset()
/** /**
* Test the download callback with valid values and no charset * Test the download callback with valid values and no charset
*/ */
public function testCurlDownloadCallbackOkHtmlCharset() public function testCurlDownloadCallbackOkHtmlCharset(): void
{ {
$charset = null;
$callback = get_curl_download_callback( $callback = get_curl_download_callback(
$charset, $charset,
$title, $title,
$desc, $desc,
$keywords, $keywords,
false, false
'ut_curl_getinfo_no_charset'
); );
$data = [ $data = [
'HTTP/1.1 200 OK',
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />', '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
'end' => 'th=device-width">' 'end' => 'th=device-width">'
. '<title>Refactoring · GitHub</title>' . '<title>Refactoring · GitHub</title>'
@ -310,14 +342,10 @@ public function testCurlDownloadCallbackOkHtmlCharset()
. '<meta name="description" content="desc" />' . '<meta name="description" content="desc" />'
. '<meta name="keywords" content="key1,key2" />', . '<meta name="keywords" content="key1,key2" />',
]; ];
foreach ($data as $key => $line) { foreach ($data as $chunk) {
$ignore = null; static::assertSame(strlen($chunk), $callback(null, $chunk));
$expected = $key !== 'end' ? strlen($line) : false;
$this->assertEquals($expected, $callback($ignore, $line));
if ($expected === false) {
break;
}
} }
$this->assertEquals('utf-8', $charset); $this->assertEquals('utf-8', $charset);
$this->assertEquals('Refactoring · GitHub', $title); $this->assertEquals('Refactoring · GitHub', $title);
$this->assertEmpty($desc); $this->assertEmpty($desc);
@ -327,25 +355,26 @@ public function testCurlDownloadCallbackOkHtmlCharset()
/** /**
* Test the download callback with valid values and no title * Test the download callback with valid values and no title
*/ */
public function testCurlDownloadCallbackOkNoTitle() public function testCurlDownloadCallbackOkNoTitle(): void
{ {
$charset = 'utf-8';
$callback = get_curl_download_callback( $callback = get_curl_download_callback(
$charset, $charset,
$title, $title,
$desc, $desc,
$keywords, $keywords,
false, false
'ut_curl_getinfo_ok'
); );
$data = [ $data = [
'HTTP/1.1 200 OK',
'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea', 'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
'ignored', 'ignored',
]; ];
foreach ($data as $key => $line) {
$ignore = null; foreach ($data as $chunk) {
$this->assertEquals(strlen($line), $callback($ignore, $line)); static::assertSame(strlen($chunk), $callback(null, $chunk));
} }
$this->assertEquals('utf-8', $charset); $this->assertEquals('utf-8', $charset);
$this->assertEmpty($title); $this->assertEmpty($title);
$this->assertEmpty($desc); $this->assertEmpty($desc);
@ -353,81 +382,55 @@ public function testCurlDownloadCallbackOkNoTitle()
} }
/** /**
* Test the download callback with an invalid content type. * Test the header callback with an invalid content type.
*/ */
public function testCurlDownloadCallbackInvalidContentType() public function testCurlHeaderCallbackInvalidContentType(): void
{ {
$callback = get_curl_download_callback( $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ct_ko');
$charset, $data = [
$title, 'HTTP/1.1 200 OK',
$desc, ];
$keywords,
false, static::assertFalse($callback(null, $data[0]));
'ut_curl_getinfo_ct_ko' static::assertNull($charset);
);
$ignore = null;
$this->assertFalse($callback($ignore, ''));
$this->assertEmpty($charset);
$this->assertEmpty($title);
} }
/** /**
* Test the download callback with an invalid response code. * Test the header callback with an invalid response code.
*/ */
public function testCurlDownloadCallbackInvalidResponseCode() public function testCurlHeaderCallbackInvalidResponseCode(): void
{ {
$callback = $callback = get_curl_download_callback( $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_rc_ko');
$charset,
$title, static::assertFalse($callback(null, ''));
$desc, static::assertNull($charset);
$keywords,
false,
'ut_curl_getinfo_rc_ko'
);
$ignore = null;
$this->assertFalse($callback($ignore, ''));
$this->assertEmpty($charset);
$this->assertEmpty($title);
} }
/** /**
* Test the download callback with an invalid content type and response code. * Test the header callback with an invalid content type and response code.
*/ */
public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode() public function testCurlHeaderCallbackInvalidContentTypeAndResponseCode(): void
{ {
$callback = $callback = get_curl_download_callback( $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_rs_ct_ko');
$charset,
$title, static::assertFalse($callback(null, ''));
$desc, static::assertNull($charset);
$keywords,
false,
'ut_curl_getinfo_rs_ct_ko'
);
$ignore = null;
$this->assertFalse($callback($ignore, ''));
$this->assertEmpty($charset);
$this->assertEmpty($title);
} }
/** /**
* Test the download callback with valid value, and retrieve_description option enabled. * Test the download callback with valid value, and retrieve_description option enabled.
*/ */
public function testCurlDownloadCallbackOkWithDesc() public function testCurlDownloadCallbackOkWithDesc(): void
{ {
$charset = 'utf-8';
$callback = get_curl_download_callback( $callback = get_curl_download_callback(
$charset, $charset,
$title, $title,
$desc, $desc,
$keywords, $keywords,
true, true
'ut_curl_getinfo_ok'
); );
$data = [ $data = [
'HTTP/1.1 200 OK',
'Server: GitHub.com',
'Date: Sat, 28 Oct 2017 12:01:33 GMT',
'Content-Type: text/html; charset=utf-8',
'Status: 200 OK',
'th=device-width">' 'th=device-width">'
. '<title>Refactoring · GitHub</title>' . '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea', . '<link rel="search" type="application/opensea',
@ -435,14 +438,11 @@ public function testCurlDownloadCallbackOkWithDesc()
. '<meta name="description" content="link desc" />' . '<meta name="description" content="link desc" />'
. '<meta name="keywords" content="key1,key2" />', . '<meta name="keywords" content="key1,key2" />',
]; ];
foreach ($data as $key => $line) {
$ignore = null; foreach ($data as $chunk) {
$expected = $key !== 'end' ? strlen($line) : false; static::assertSame(strlen($chunk), $callback(null, $chunk));
$this->assertEquals($expected, $callback($ignore, $line));
if ($expected === false) {
break;
}
} }
$this->assertEquals('utf-8', $charset); $this->assertEquals('utf-8', $charset);
$this->assertEquals('Refactoring · GitHub', $title); $this->assertEquals('Refactoring · GitHub', $title);
$this->assertEquals('link desc', $desc); $this->assertEquals('link desc', $desc);
@ -453,8 +453,9 @@ public function testCurlDownloadCallbackOkWithDesc()
* Test the download callback with valid value, and retrieve_description option enabled, * Test the download callback with valid value, and retrieve_description option enabled,
* but no desc or keyword defined in the page. * but no desc or keyword defined in the page.
*/ */
public function testCurlDownloadCallbackOkWithDescNotFound() public function testCurlDownloadCallbackOkWithDescNotFound(): void
{ {
$charset = 'utf-8';
$callback = get_curl_download_callback( $callback = get_curl_download_callback(
$charset, $charset,
$title, $title,
@ -464,24 +465,16 @@ public function testCurlDownloadCallbackOkWithDescNotFound()
'ut_curl_getinfo_ok' 'ut_curl_getinfo_ok'
); );
$data = [ $data = [
'HTTP/1.1 200 OK',
'Server: GitHub.com',
'Date: Sat, 28 Oct 2017 12:01:33 GMT',
'Content-Type: text/html; charset=utf-8',
'Status: 200 OK',
'th=device-width">' 'th=device-width">'
. '<title>Refactoring · GitHub</title>' . '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea', . '<link rel="search" type="application/opensea',
'end' => '<title>ignored</title>', 'end' => '<title>ignored</title>',
]; ];
foreach ($data as $key => $line) {
$ignore = null; foreach ($data as $chunk) {
$expected = $key !== 'end' ? strlen($line) : false; static::assertSame(strlen($chunk), $callback(null, $chunk));
$this->assertEquals($expected, $callback($ignore, $line));
if ($expected === false) {
break;
}
} }
$this->assertEquals('utf-8', $charset); $this->assertEquals('utf-8', $charset);
$this->assertEquals('Refactoring · GitHub', $title); $this->assertEquals('Refactoring · GitHub', $title);
$this->assertEmpty($desc); $this->assertEmpty($desc);

View file

@ -38,6 +38,7 @@ public function testFullRetrieval(): void
$remoteTitle = 'Remote Title '; $remoteTitle = 'Remote Title ';
$remoteDesc = 'Sometimes the meta description is relevant.'; $remoteDesc = 'Sometimes the meta description is relevant.';
$remoteTags = 'abc def'; $remoteTags = 'abc def';
$remoteCharset = 'utf-8';
$expectedResult = [ $expectedResult = [
'title' => $remoteTitle, 'title' => $remoteTitle,
@ -45,11 +46,28 @@ public function testFullRetrieval(): void
'tags' => $remoteTags, 'tags' => $remoteTags,
]; ];
$this->httpAccess
->expects(static::once())
->method('getCurlHeaderCallback')
->willReturnCallback(
function (&$charset) use (
$remoteCharset
): callable {
return function () use (
&$charset,
$remoteCharset
): void {
$charset = $remoteCharset;
};
}
)
;
$this->httpAccess $this->httpAccess
->expects(static::once()) ->expects(static::once())
->method('getCurlDownloadCallback') ->method('getCurlDownloadCallback')
->willReturnCallback( ->willReturnCallback(
function (&$charset, &$title, &$description, &$tags) use ( function (&$charset, &$title, &$description, &$tags) use (
$remoteCharset,
$remoteTitle, $remoteTitle,
$remoteDesc, $remoteDesc,
$remoteTags $remoteTags
@ -59,11 +77,13 @@ function (&$charset, &$title, &$description, &$tags) use (
&$title, &$title,
&$description, &$description,
&$tags, &$tags,
$remoteCharset,
$remoteTitle, $remoteTitle,
$remoteDesc, $remoteDesc,
$remoteTags $remoteTags
): void { ): void {
$charset = 'ISO-8859-1'; static::assertSame($remoteCharset, $charset);
$title = $remoteTitle; $title = $remoteTitle;
$description = $remoteDesc; $description = $remoteDesc;
$tags = $remoteTags; $tags = $remoteTags;
@ -75,8 +95,9 @@ function (&$charset, &$title, &$description, &$tags) use (
->expects(static::once()) ->expects(static::once())
->method('getHttpResponse') ->method('getHttpResponse')
->with($url, 30, 4194304) ->with($url, 30, 4194304)
->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void { ->willReturnCallback(function($url, $timeout, $maxBytes, $headerCallback, $dlCallback): void {
$callback(); $headerCallback();
$dlCallback();
}) })
; ;
@ -102,8 +123,17 @@ public function testEmptyRetrieval(): void
->expects(static::once()) ->expects(static::once())
->method('getCurlDownloadCallback') ->method('getCurlDownloadCallback')
->willReturnCallback( ->willReturnCallback(
function (&$charset, &$title, &$description, &$tags): callable { function (): callable {
return function () use (&$charset, &$title, &$description, &$tags): void {}; return function (): void {};
}
)
;
$this->httpAccess
->expects(static::once())
->method('getCurlHeaderCallback')
->willReturnCallback(
function (): callable {
return function (): void {};
} }
) )
; ;
@ -111,8 +141,9 @@ function (&$charset, &$title, &$description, &$tags): callable {
->expects(static::once()) ->expects(static::once())
->method('getHttpResponse') ->method('getHttpResponse')
->with($url, 30, 4194304) ->with($url, 30, 4194304)
->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void { ->willReturnCallback(function($url, $timeout, $maxBytes, $headerCallback, $dlCallback): void {
$callback(); $headerCallback();
$dlCallback();
}) })
; ;