Improve metadata retrieval (performance and accuracy)

- Use a dedicated function to download headers, to avoid applying multiple regexps on headers - Also try to extract the title from meta tags
2020-10-15 11:20:33 +02:00 · 2020-10-15 11:20:33 +02:00 · 5334090be0
commit 5334090be0
parent 4cf3564d28
5 changed files with 249 additions and 189 deletions
--- a/application/http/HttpAccess.php
+++ b/application/http/HttpAccess.php
@ -14,9 +14,14 @@ namespace Shaarli\Http;
 */
 class HttpAccess
 {
-    public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
-    {
-        return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction);
+    public function getHttpResponse(
+        $url,
+        $timeout = 30,
+        $maxBytes = 4194304,
+        $curlHeaderFunction = null,
+        $curlWriteFunction = null
+    ) {
+        return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);
    }

    public function getCurlDownloadCallback(
@ -24,16 +29,19 @@ class HttpAccess
        &$title,
        &$description,
        &$keywords,
-        $retrieveDescription,
-        $curlGetInfo = 'curl_getinfo'
+        $retrieveDescription
    ) {
        return get_curl_download_callback(
            $charset,
            $title,
            $description,
            $keywords,
-            $retrieveDescription,
-            $curlGetInfo
+            $retrieveDescription
        );
    }
+
+    public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
+    {
+        return get_curl_header_callback($charset, $curlGetInfo);
+    }
 }
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@ -6,12 +6,14 @@ use Shaarli\Http\Url;
 * GET an HTTP URL to retrieve its content
 * Uses the cURL library or a fallback method
 *
- * @param string          $url               URL to get (http://...)
- * @param int             $timeout           network timeout (in seconds)
- * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
- * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
- *                                           Can be used to add download conditions on the
- *                                           headers (response code, content type, etc.).
+ * @param string          $url                URL to get (http://...)
+ * @param int             $timeout            network timeout (in seconds)
+ * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
+ *                                            (CURLOPT_HEADERFUNCTION)
+ * @param callable|string $curlWriteFunction  Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ *                                            Can be used to add download conditions on the
+ *                                            headers (response code, content type, etc.).
 *
 * @return array HTTP response headers, downloaded content
 *
@ -35,8 +37,13 @@ use Shaarli\Http\Url;
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
-{
+function get_http_response(
+    $url,
+    $timeout = 30,
+    $maxBytes = 4194304,
+    $curlHeaderFunction = null,
+    $curlWriteFunction = null
+) {
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();

@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
    // General cURL settings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
-    curl_setopt($ch, CURLOPT_HEADER, true);
+    // Default header download if the $curlHeaderFunction is not defined
+    curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
    curl_setopt(
        $ch,
        CURLOPT_HTTPHEADER,
@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

-    if (is_callable($curlWriteFunction)) {
-        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
-    }
-
    // Max download size management
    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
+    if (is_callable($curlHeaderFunction)) {
+        curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
+    }
+    if (is_callable($curlWriteFunction)) {
+        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
+    }
    curl_setopt(
        $ch,
        CURLOPT_PROGRESSFUNCTION,
-        function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
-            if (version_compare(phpversion(), '5.5', '<')) {
-                // PHP version lower than 5.5
-                // Callback has 4 arguments
-                $downloaded = $arg1;
-            } else {
-                // Callback has 5 arguments
-                $downloaded = $arg2;
-            }
+        function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
+            $downloaded = $arg2;
+
            // Non-zero return stops downloading
            return ($downloaded > $maxBytes) ? 1 : 0;
        }
@ -493,53 +497,22 @@ function is_https($server)
 * Get cURL callback function for CURLOPT_WRITEFUNCTION
 *
 * @param string $charset     to extract from the downloaded page (reference)
- * @param string $title       to extract from the downloaded page (reference)
- * @param string $description to extract from the downloaded page (reference)
- * @param string $keywords    to extract from the downloaded page (reference)
- * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 * @param string $curlGetInfo Optionally overrides curl_getinfo function
 *
 * @return Closure
 */
-function get_curl_download_callback(
+function get_curl_header_callback(
    &$charset,
-    &$title,
-    &$description,
-    &$keywords,
-    $retrieveDescription,
    $curlGetInfo = 'curl_getinfo'
 ) {
    $isRedirected = false;
-    $currentChunk = 0;
-    $foundChunk = null;

-    /**
-     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
-     *
-     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
-     * Then we extract the title and the charset and stop the download when it's done.
-     *
-     * @param resource $ch   cURL resource
-     * @param string   $data chunk of data being downloaded
-     *
-     * @return int|bool length of $data or false if we need to stop the download
-     */
-    return function (&$ch, $data) use (
-        $retrieveDescription,
-        $curlGetInfo,
-        &$charset,
-        &$title,
-        &$description,
-        &$keywords,
-        &$isRedirected,
-        &$currentChunk,
-        &$foundChunk
-    ) {
-        $currentChunk++;
+    return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        $chunkLength = strlen($data);
        if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
            $isRedirected = true;
-            return strlen($data);
+            return $chunkLength;
        }
        if (!empty($responseCode) && $responseCode !== 200) {
            return false;
@ -555,6 +528,56 @@ function get_curl_download_callback(
        if (!empty($contentType) && empty($charset)) {
            $charset = header_extract_charset($contentType);
        }
+
+        return $chunkLength;
+    };
+}
+
+/**
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
+ *
+ * @param string $charset     to extract from the downloaded page (reference)
+ * @param string $title       to extract from the downloaded page (reference)
+ * @param string $description to extract from the downloaded page (reference)
+ * @param string $keywords    to extract from the downloaded page (reference)
+ * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
+ * @param string $curlGetInfo Optionally overrides curl_getinfo function
+ *
+ * @return Closure
+ */
+function get_curl_download_callback(
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription
+) {
+    $currentChunk = 0;
+    $foundChunk = null;
+
+    /**
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
+     *
+     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
+     * Then we extract the title and the charset and stop the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function ($ch, $data) use (
+        $retrieveDescription,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $chunkLength = strlen($data);
+        $currentChunk++;
+
        if (empty($charset)) {
            $charset = html_extract_charset($data);
        }
@ -562,6 +585,10 @@ function get_curl_download_callback(
            $title = html_extract_title($data);
            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
        }
+        if (empty($title)) {
+            $title = html_extract_tag('title', $data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
        if ($retrieveDescription && empty($description)) {
            $description = html_extract_tag('description', $data);
            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
@ -591,6 +618,6 @@ function get_curl_download_callback(
            return false;
        }

-        return strlen($data);
+        return $chunkLength;
    };
 }
--- a/application/http/MetadataRetriever.php
+++ b/application/http/MetadataRetriever.php
@ -46,6 +46,7 @@ class MetadataRetriever
            $url,
            $this->conf->get('general.download_timeout', 30),
            $this->conf->get('general.download_max_size', 4194304),
+            $this->httpAccess->getCurlHeaderCallback($charset),
            $this->httpAccess->getCurlDownloadCallback(
                $charset,
                $title,