Extract the title/charset during page download, and check content type

Use CURLOPT_WRITEFUNCTION to check the response code and content type (only HTML is allowed). Also extract the title and charset from each chunk of data as it is downloaded, and stop the download once everything has been extracted. Closes #579
2017-09-30 11:04:13 +02:00 · 2017-09-30 11:04:13 +02:00 · d65342e304
commit d65342e304
parent a59bbf50d7
4 changed files with 293 additions and 68 deletions
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@ -6,6 +6,8 @@
 * @param string          $url               URL to get (http://...)
 * @param int             $timeout           network timeout (in seconds)
 * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
 *                                           Can be used to add download conditions on the headers (response code, content type, etc.).
 *
 * @return array HTTP response headers, downloaded content
 *
@ -29,7 +31,7 @@
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
+function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
 {
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();
@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
    curl_setopt($ch, CURLOPT_TIMEOUT,           $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT,         $userAgent);
    if (is_callable($curlWriteFunction)) {
        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
    }
    // Max download size management
    curl_setopt($ch, CURLOPT_BUFFERSIZE,        1024);
    curl_setopt($ch, CURLOPT_NOPROGRESS,        false);
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@ -1,5 +1,54 @@
 <?php
 /**
 * Build the cURL write callback (CURLOPT_WRITEFUNCTION) used while downloading a page.
 *
 * The returned closure fills $charset and $title by reference as chunks arrive,
 * so the caller can read both variables once the transfer has ended.
 *
 * @param string          $charset     Filled with the charset found for the downloaded page (reference)
 * @param string          $title       Filled with the title found in the downloaded page (reference)
 * @param callable|string $curlGetInfo Optionally overrides the curl_getinfo function
 *
 * @return Closure
 */
 function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
 {
    /**
     * cURL callback function for CURLOPT_WRITEFUNCTION (called for every downloaded chunk).
     *
     * While downloading the remote page, we check that the HTTP code is 200 and that the
     * content type is 'text/html'. Then we extract the title and the charset, and stop
     * the download as soon as both are known.
     *
     * The handle is received by value: cURL does not pass it by reference, and a
     * by-reference parameter triggers a warning on every chunk under PHP 8.
     *
     * @param resource $ch   cURL resource
     * @param string   $data chunk of data being downloaded
     *
     * @return int|bool length of $data or false if we need to stop the download
     */
    return function ($ch, $data) use ($curlGetInfo, &$charset, &$title) {
        // An empty code means the status line has not been parsed yet: keep going.
        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
        if (!empty($responseCode) && $responseCode != 200) {
            return false;
        }

        // Media types are case-insensitive, so 'Text/HTML' must be accepted as well.
        $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
        if (!empty($contentType) && stripos($contentType, 'text/html') === false) {
            return false;
        }

        // The Content-Type header is tried first, then the content of the chunk.
        if (empty($charset)) {
            $charset = header_extract_charset($contentType);
        }
        if (empty($charset)) {
            $charset = html_extract_charset($data);
        }
        if (empty($title)) {
            $title = html_extract_title($data);
        }

        // We got everything we want, stop the download.
        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
            return false;
        }

        // Returning the chunk length tells cURL to carry on with the transfer.
        return strlen($data);
    };
 }
 /**
 * Extract title from an HTML document.
 *
@ -16,46 +65,18 @@ function html_extract_title($html)
 }
 /**
- * Determine charset from downloaded page.
+ * Extract charset from HTTP header if it's defined.
 * Priority:
 *   1. HTTP headers (Content type).
 *   2. HTML content page (tag <meta charset>).
 *   3. Use a default charset (default: UTF-8).
 *
- * @param array  $headers           HTTP headers array.
+ * @param string $header HTTP header Content-Type line.
 * @param string $htmlContent       HTML content where to look for charset.
 * @param string $defaultCharset    Default charset to apply if other methods failed.
 *
 * @return string Determined charset.
 */
 function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
 {
    // 1. Whatever headers_extract_charset() finds in the HTTP headers wins.
    $fromHeaders = headers_extract_charset($headers);
    if ($fromHeaders) {
        return $fromHeaders;
    }

    // 2. Otherwise look into the HTML content, and 3. fall back to the default.
    $fromHtml = html_extract_charset($htmlContent);

    return $fromHtml ? $fromHtml : $defaultCharset;
 }
 /**
 * Extract charset from HTTP headers if it's defined.
 *
 * @param array $headers HTTP headers array.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
-function headers_extract_charset($headers)
+function header_extract_charset($header)
 {
-    if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
+    preg_match('/charset="?([^; ]+)/i', $header, $match);
        preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
    if (! empty($match[1])) {
        return strtolower(trim($match[1]));
    }
    }
    return false;
 }
--- a/index.php
+++ b/index.php
@ -1428,18 +1428,12 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history)
            // If this is an HTTP(S) link, we try to get the page to extract the title (otherwise we go straight to the edit form).
            if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
                // Short timeout to keep the application responsive
-                list($headers, $content) = get_http_response($url, 4);
+                // The callback will fill $charset and $title with data from the downloaded page.
-                if (strpos($headers[0], '200 OK') !== false) {
+                get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
                    // Retrieve charset.
                    $charset = get_charset($headers, $content);
                    // Extract title.
                    $title = html_extract_title($content);
                    // Re-encode title in utf-8 if necessary.
                if (! empty($title) && strtolower($charset) != 'utf-8') {
                    $title = mb_convert_encoding($title, 'utf-8', $charset);
                }
            }
            }
            if ($url == '') {
                $url = '?' . smallHash($linkdate . $LINKSDB->getNextId());
--- a/tests/LinkUtilsTest.php
+++ b/tests/LinkUtilsTest.php
@ -28,28 +28,14 @@ public function testHtmlExtractNonExistentTitle()
        $this->assertFalse(html_extract_title($html));
    }
    /**
     * get_charset() must follow its lookup order: headers, then HTML, then the default.
     */
    public function testGetCharset()
    {
        $withCharset = array('Content-Type' => 'text/html; charset=Headers');
        $noHeaders = array();
        $page = '<html><meta>stuff</meta><meta charset="Html"/></html>';
        $fallback = 'default';

        // Charset present in the headers.
        $this->assertEquals('headers', get_charset($withCharset, $page, $fallback));
        // No headers: the HTML content is used.
        $this->assertEquals('html', get_charset($noHeaders, $page, $fallback));
        // Nothing found anywhere: the given default, then the built-in one.
        $this->assertEquals($fallback, get_charset($noHeaders, '', $fallback));
        $this->assertEquals('utf-8', get_charset($noHeaders, ''));
    }
    /**
     * Test headers_extract_charset() when the charset is found.
     */
    public function testHeadersExtractExistentCharset()
    {
        $charset = 'x-MacCroatian';
-        $headers = array('Content-Type' => 'text/html; charset='. $charset);
+        $headers = 'text/html; charset='. $charset;
-        $this->assertEquals(strtolower($charset), headers_extract_charset($headers));
+        $this->assertEquals(strtolower($charset), header_extract_charset($headers));
    }
    /**
@ -57,11 +43,11 @@ public function testHeadersExtractExistentCharset()
     */
    public function testHeadersExtractNonExistentCharset()
    {
-        $headers = array();
+        $headers = '';
-        $this->assertFalse(headers_extract_charset($headers));
+        $this->assertFalse(header_extract_charset($headers));
-        $headers = array('Content-Type' => 'text/html');
+        $headers = 'text/html';
-        $this->assertFalse(headers_extract_charset($headers));
+        $this->assertFalse(header_extract_charset($headers));
    }
    /**
@ -85,6 +71,131 @@ public function testHtmlExtractNonExistentCharset()
        $this->assertFalse(html_extract_charset($html));
    }
    /**
     * Download callback: code 200, HTML content type with charset, and a title in the page.
     */
    public function testCurlDownloadCallbackOk()
    {
        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
        // The chunk stored under the 'end' key is the one expected to stop the download.
        $chunks = [
            'HTTP/1.1 200 OK',
            'Server: GitHub.com',
            'Date: Sat, 28 Oct 2017 12:01:33 GMT',
            'Content-Type: text/html; charset=utf-8',
            'Status: 200 OK',
            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
            '<title>ignored</title>',
        ];

        foreach ($chunks as $index => $chunk) {
            $handle = null;
            if ($index === 'end') {
                $this->assertEquals(false, $callback($handle, $chunk));
                break;
            }
            $this->assertEquals(strlen($chunk), $callback($handle, $chunk));
        }

        $this->assertEquals('utf-8', $charset);
        $this->assertEquals('Refactoring · GitHub', $title);
    }
    /**
     * Download callback: valid response, but no charset in the content type or the page.
     */
    public function testCurlDownloadCallbackOkNoCharset()
    {
        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
        $chunks = [
            'HTTP/1.1 200 OK',
            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
            '<title>ignored</title>',
        ];

        // Every chunk is expected to be accepted: the callback returns its length each time.
        foreach ($chunks as $chunk) {
            $handle = null;
            $this->assertEquals(strlen($chunk), $callback($handle, $chunk));
        }

        $this->assertEmpty($charset);
        $this->assertEquals('Refactoring · GitHub', $title);
    }
    /**
     * Download callback: no charset in the content type, charset taken from a <meta> tag.
     */
    public function testCurlDownloadCallbackOkHtmlCharset()
    {
        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
        // The chunk stored under the 'end' key is the one expected to stop the download.
        $chunks = [
            'HTTP/1.1 200 OK',
            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
            '<title>ignored</title>',
        ];

        foreach ($chunks as $index => $chunk) {
            $handle = null;
            if ($index === 'end') {
                $this->assertEquals(false, $callback($handle, $chunk));
                break;
            }
            $this->assertEquals(strlen($chunk), $callback($handle, $chunk));
        }

        $this->assertEquals('utf-8', $charset);
        $this->assertEquals('Refactoring · GitHub', $title);
    }
    /**
     * Download callback: valid response with a charset, but no <title> tag in the page.
     */
    public function testCurlDownloadCallbackOkNoTitle()
    {
        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
        $chunks = [
            'HTTP/1.1 200 OK',
            'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
            'ignored',
        ];

        // Every chunk is expected to be accepted: the callback returns its length each time.
        foreach ($chunks as $chunk) {
            $handle = null;
            $this->assertEquals(strlen($chunk), $callback($handle, $chunk));
        }

        $this->assertEquals('utf-8', $charset);
        $this->assertEmpty($title);
    }
    /**
     * Download callback: a non-HTML content type must stop the download immediately.
     */
    public function testCurlDownloadCallbackInvalidContentType()
    {
        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
        $handle = null;

        $result = $callback($handle, '');

        $this->assertFalse($result);
        // Nothing must have been extracted.
        $this->assertEmpty($charset);
        $this->assertEmpty($title);
    }
    /**
     * Download callback: a response code other than 200 must stop the download immediately.
     */
    public function testCurlDownloadCallbackInvalidResponseCode()
    {
        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
        $handle = null;

        $result = $callback($handle, '');

        $this->assertFalse($result);
        // Nothing must have been extracted.
        $this->assertEmpty($charset);
        $this->assertEmpty($title);
    }
    /**
     * Download callback: both the response code and the content type are invalid.
     */
    public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
    {
        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
        $handle = null;

        $result = $callback($handle, '');

        $this->assertFalse($result);
        // Nothing must have been extracted.
        $this->assertEmpty($charset);
        $this->assertEmpty($title);
    }
    /**
     * Test count_private.
     */
@ -182,3 +293,96 @@ private function getHashtagLink($hashtag, $index = '')
        return str_replace('$1', $hashtag, $hashtagLink);
    }
 }
 // old style mock: PHPUnit doesn't allow function mock
 /**
 * curl_getinfo() stand-in: HTTP 200 and an HTML content type that carries a charset.
 *
 * @param resource $ch   cURL resource (unused)
 * @param int      $type cURL info type
 *
 * @return int|string|null 200, 'text/html; charset=utf-8', or null for any other info type
 */
 function ut_curl_getinfo_ok($ch, $type)
 {
    if ($type == CURLINFO_RESPONSE_CODE) {
        return 200;
    }
    if ($type == CURLINFO_CONTENT_TYPE) {
        return 'text/html; charset=utf-8';
    }
    return null;
 }
 /**
 * curl_getinfo() stand-in: HTTP 200 and an HTML content type without any charset.
 *
 * @param resource $ch   cURL resource (unused)
 * @param int      $type cURL info type
 *
 * @return int|string|null 200, 'text/html', or null for any other info type
 */
 function ut_curl_getinfo_no_charset($ch, $type)
 {
    if ($type == CURLINFO_RESPONSE_CODE) {
        return 200;
    }
    if ($type == CURLINFO_CONTENT_TYPE) {
        return 'text/html';
    }
    return null;
 }
 /**
 * curl_getinfo() stand-in: invalid response code (404) with a valid HTML content type.
 *
 * @param resource $ch   cURL resource (unused)
 * @param int      $type cURL info type
 *
 * @return int|string|null 404, 'text/html; charset=utf-8', or null for any other info type
 */
 function ut_curl_getinfo_rc_ko($ch, $type)
 {
    if ($type == CURLINFO_RESPONSE_CODE) {
        return 404;
    }
    if ($type == CURLINFO_CONTENT_TYPE) {
        return 'text/html; charset=utf-8';
    }
    return null;
 }
 /**
 * curl_getinfo() stand-in: HTTP 200 with an invalid (non-HTML) content type.
 *
 * @param resource $ch   cURL resource (unused)
 * @param int      $type cURL info type
 *
 * @return int|string|null 200, 'text/plain', or null for any other info type
 */
 function ut_curl_getinfo_ct_ko($ch, $type)
 {
    if ($type == CURLINFO_RESPONSE_CODE) {
        return 200;
    }
    if ($type == CURLINFO_CONTENT_TYPE) {
        return 'text/plain';
    }
    return null;
 }
 /**
 * curl_getinfo() stand-in: invalid response code (404) and invalid content type.
 *
 * @param resource $ch   cURL resource (unused)
 * @param int      $type cURL info type
 *
 * @return int|string|null 404, 'text/plain', or null for any other info type
 */
 function ut_curl_getinfo_rs_ct_ko($ch, $type)
 {
    if ($type == CURLINFO_RESPONSE_CODE) {
        return 404;
    }
    if ($type == CURLINFO_CONTENT_TYPE) {
        return 'text/plain';
    }
    return null;
 }