From 5334090be04e66da5cb5c3ad487604b3733c5cac Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 15 Oct 2020 11:20:33 +0200 Subject: [PATCH] Improve metadata retrieval (performances and accuracy) - Use dedicated function to download headers to avoid apply multiple regexps on headers - Also try to extract title from meta tags --- application/http/HttpAccess.php | 22 ++- application/http/HttpUtils.php | 143 +++++++++------- application/http/MetadataRetriever.php | 1 + tests/bookmark/LinkUtilsTest.php | 227 ++++++++++++------------- tests/http/MetadataRetrieverTest.php | 45 ++++- 5 files changed, 249 insertions(+), 189 deletions(-) diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php index 81d9e076..646a5264 100644 --- a/application/http/HttpAccess.php +++ b/application/http/HttpAccess.php @@ -14,9 +14,14 @@ namespace Shaarli\Http; */ class HttpAccess { - public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) - { - return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); + public function getHttpResponse( + $url, + $timeout = 30, + $maxBytes = 4194304, + $curlHeaderFunction = null, + $curlWriteFunction = null + ) { + return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction); } public function getCurlDownloadCallback( @@ -24,16 +29,19 @@ class HttpAccess &$title, &$description, &$keywords, - $retrieveDescription, - $curlGetInfo = 'curl_getinfo' + $retrieveDescription ) { return get_curl_download_callback( $charset, $title, $description, $keywords, - $retrieveDescription, - $curlGetInfo + $retrieveDescription ); } + + public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo') + { + return get_curl_header_callback($charset, $curlGetInfo); + } } diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 9f414073..28c12969 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php @@ -6,12 +6,14 @@ use Shaarli\Http\Url; * GET an HTTP URL to retrieve its content * Uses the cURL library or a fallback method * - * @param string $url URL to get (http://...) - * @param int $timeout network timeout (in seconds) - * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) - * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). - * Can be used to add download conditions on the - * headers (response code, content type, etc.). + * @param string $url URL to get (http://...) + * @param int $timeout network timeout (in seconds) + * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param callable|string $curlHeaderFunction Optional callback called during the download of headers + * (CURLOPT_HEADERFUNCTION) + * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). + * Can be used to add download conditions on the + * headers (response code, content type, etc.). * * @return array HTTP response headers, downloaded content * @@ -35,8 +37,13 @@ use Shaarli\Http\Url; * @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/1462720 */ -function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) -{ +function get_http_response( + $url, + $timeout = 30, + $maxBytes = 4194304, + $curlHeaderFunction = null, + $curlWriteFunction = null +) { $urlObj = new Url($url); $cleanUrl = $urlObj->idnToAscii(); @@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF // General cURL settings curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_HEADER, true); + // Default header download if the $curlHeaderFunction is not defined + curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); curl_setopt( $ch, CURLOPT_HTTPHEADER, @@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); - if (is_callable($curlWriteFunction)) { - curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); - } - // Max download size management curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); curl_setopt($ch, CURLOPT_NOPROGRESS, false); + if (is_callable($curlHeaderFunction)) { + curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); + } + if (is_callable($curlWriteFunction)) { + curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); + } curl_setopt( $ch, CURLOPT_PROGRESSFUNCTION, - function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { - if (version_compare(phpversion(), '5.5', '<')) { - // PHP version lower than 5.5 - // Callback has 4 arguments - $downloaded = $arg1; - } else { - // Callback has 5 arguments - $downloaded = $arg2; - } + function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { + $downloaded = $arg2; + // Non-zero return stops downloading return ($downloaded > $maxBytes) ? 1 : 0; } @@ -493,53 +497,22 @@ function is_https($server) * Get cURL callback function for CURLOPT_WRITEFUNCTION * * @param string $charset to extract from the downloaded page (reference) - * @param string $title to extract from the downloaded page (reference) - * @param string $description to extract from the downloaded page (reference) - * @param string $keywords to extract from the downloaded page (reference) - * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content * @param string $curlGetInfo Optionally overrides curl_getinfo function * * @return Closure */ -function get_curl_download_callback( +function get_curl_header_callback( &$charset, - &$title, - &$description, - &$keywords, - $retrieveDescription, $curlGetInfo = 'curl_getinfo' ) { $isRedirected = false; - $currentChunk = 0; - $foundChunk = null; - /** - * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). - * - * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text' - * Then we extract the title and the charset and stop the download when it's done. - * - * @param resource $ch cURL resource - * @param string $data chunk of data being downloaded - * - * @return int|bool length of $data or false if we need to stop the download - */ - return function (&$ch, $data) use ( - $retrieveDescription, - $curlGetInfo, - &$charset, - &$title, - &$description, - &$keywords, - &$isRedirected, - &$currentChunk, - &$foundChunk - ) { - $currentChunk++; + return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); + $chunkLength = strlen($data); if (!empty($responseCode) && in_array($responseCode, [301, 302])) { $isRedirected = true; - return strlen($data); + return $chunkLength; } if (!empty($responseCode) && $responseCode !== 200) { return false; @@ -555,6 +528,56 @@ function get_curl_download_callback( if (!empty($contentType) && empty($charset)) { $charset = header_extract_charset($contentType); } + + return $chunkLength; + }; +} + +/** + * Get cURL callback function for CURLOPT_WRITEFUNCTION + * + * @param string $charset to extract from the downloaded page (reference) + * @param string $title to extract from the downloaded page (reference) + * @param string $description to extract from the downloaded page (reference) + * @param string $keywords to extract from the downloaded page (reference) + * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content + * @param string $curlGetInfo Optionally overrides curl_getinfo function + * + * @return Closure + */ +function get_curl_download_callback( + &$charset, + &$title, + &$description, + &$keywords, + $retrieveDescription +) { + $currentChunk = 0; + $foundChunk = null; + + /** + * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). + * + * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text' + * Then we extract the title and the charset and stop the download when it's done. + * + * @param resource $ch cURL resource + * @param string $data chunk of data being downloaded + * + * @return int|bool length of $data or false if we need to stop the download + */ + return function ($ch, $data) use ( + $retrieveDescription, + &$charset, + &$title, + &$description, + &$keywords, + &$currentChunk, + &$foundChunk + ) { + $chunkLength = strlen($data); + $currentChunk++; + if (empty($charset)) { $charset = html_extract_charset($data); } @@ -562,6 +585,10 @@ function get_curl_download_callback( $title = html_extract_title($data); $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; } + if (empty($title)) { + $title = html_extract_tag('title', $data); + $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; + } if ($retrieveDescription && empty($description)) { $description = html_extract_tag('description', $data); $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; @@ -591,6 +618,6 @@ function get_curl_download_callback( return false; } - return strlen($data); + return $chunkLength; }; } diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php index 2ca982e2..ba9bd40c 100644 --- a/application/http/MetadataRetriever.php +++ b/application/http/MetadataRetriever.php @@ -46,6 +46,7 @@ class MetadataRetriever $url, $this->conf->get('general.download_timeout', 30), $this->conf->get('general.download_max_size', 4194304), + $this->httpAccess->getCurlHeaderCallback($charset), $this->httpAccess->getCurlDownloadCallback( $charset, $title, diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 29941c8c..3321242f 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -216,60 +216,91 @@ class LinkUtilsTest extends TestCase } /** - * Test the download callback with valid value + * Test the header callback with valid value */ - public function testCurlDownloadCallbackOk() + public function testCurlHeaderCallbackOk(): void { - $callback = get_curl_download_callback( - $charset, - $title, - $desc, - $keywords, - false, - 'ut_curl_getinfo_ok' - ); + $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ok'); $data = [ 'HTTP/1.1 200 OK', 'Server: GitHub.com', 'Date: Sat, 28 Oct 2017 12:01:33 GMT', 'Content-Type: text/html; charset=utf-8', 'Status: 200 OK', - 'end' => 'th=device-width">' + ]; + + foreach ($data as $chunk) { + static::assertIsInt($callback(null, $chunk)); + } + + static::assertSame('utf-8', $charset); + } + + /** + * Test the download callback with valid value + */ + public function testCurlDownloadCallbackOk(): void + { + $charset = 'utf-8'; + $callback = get_curl_download_callback( + $charset, + $title, + $desc, + $keywords, + false + ); + + $data = [ + 'th=device-width">' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } - $this->assertEquals('utf-8', $charset); - $this->assertEquals('Refactoring · GitHub', $title); - $this->assertEmpty($desc); - $this->assertEmpty($keywords); + + static::assertSame('utf-8', $charset); + static::assertSame('Refactoring · GitHub', $title); + static::assertEmpty($desc); + static::assertEmpty($keywords); + } + + /** + * Test the header callback with valid value + */ + public function testCurlHeaderCallbackNoCharset(): void + { + $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_no_charset'); + $data = [ + 'HTTP/1.1 200 OK', + ]; + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); + } + + static::assertFalse($charset); } /** * Test the download callback with valid values and no charset */ - public function testCurlDownloadCallbackOkNoCharset() + public function testCurlDownloadCallbackOkNoCharset(): void { + $charset = null; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_no_charset' + false ); + $data = [ - 'HTTP/1.1 200 OK', 'end' => 'th=device-width">' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $this->assertEquals(strlen($line), $callback($ignore, $line)); + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEmpty($charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEmpty($desc); @@ -290,18 +322,18 @@ class LinkUtilsTest extends TestCase /** * Test the download callback with valid values and no charset */ - public function testCurlDownloadCallbackOkHtmlCharset() + public function testCurlDownloadCallbackOkHtmlCharset(): void { + $charset = null; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_no_charset' + false ); + $data = [ - 'HTTP/1.1 200 OK', '', 'end' => 'th=device-width">' . 'Refactoring · GitHub' @@ -310,14 +342,10 @@ class LinkUtilsTest extends TestCase . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEquals('utf-8', $charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEmpty($desc); @@ -327,25 +355,26 @@ class LinkUtilsTest extends TestCase /** * Test the download callback with valid values and no title */ - public function testCurlDownloadCallbackOkNoTitle() + public function testCurlDownloadCallbackOkNoTitle(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_ok' + false ); + $data = [ - 'HTTP/1.1 200 OK', 'end' => 'th=device-width">Refactoring · GitHub' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEquals('utf-8', $charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEquals('link desc', $desc); @@ -453,8 +453,9 @@ class LinkUtilsTest extends TestCase * Test the download callback with valid value, and retrieve_description option enabled, * but no desc or keyword defined in the page. */ - public function testCurlDownloadCallbackOkWithDescNotFound() + public function testCurlDownloadCallbackOkWithDescNotFound(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, @@ -464,24 +465,16 @@ class LinkUtilsTest extends TestCase 'ut_curl_getinfo_ok' ); $data = [ - 'HTTP/1.1 200 OK', - 'Server: GitHub.com', - 'Date: Sat, 28 Oct 2017 12:01:33 GMT', - 'Content-Type: text/html; charset=utf-8', - 'Status: 200 OK', 'th=device-width">' . 'Refactoring · GitHub' . '