diff --git a/application/HttpUtils.php b/application/HttpUtils.php index c9371b5..83a4c5e 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php @@ -3,9 +3,11 @@ * GET an HTTP URL to retrieve its content * Uses the cURL library or a fallback method * - * @param string $url URL to get (http://...) - * @param int $timeout network timeout (in seconds) - * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param string $url URL to get (http://...) + * @param int $timeout network timeout (in seconds) + * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). + * Can be used to add download conditions on the headers (response code, content type, etc.). * * @return array HTTP response headers, downloaded content * @@ -29,7 +31,7 @@ * @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/1462720 */ -function get_http_response($url, $timeout = 30, $maxBytes = 4194304) +function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) { $urlObj = new Url($url); $cleanUrl = $urlObj->idnToAscii(); @@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304) curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); + if (is_callable($curlWriteFunction)) { + curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); + } + // Max download size management curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); curl_setopt($ch, CURLOPT_NOPROGRESS, false); diff --git a/application/LinkUtils.php b/application/LinkUtils.php index e3d95d0..3705f7e 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php @@ -1,5 +1,54 @@ ). - * 3. Use a default charset (default: UTF-8). + * Extract charset from HTTP header if it's defined. * - * @param array $headers HTTP headers array. - * @param string $htmlContent HTML content where to look for charset. - * @param string $defaultCharset Default charset to apply if other methods failed. - * - * @return string Determined charset. - */ -function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') -{ - if ($charset = headers_extract_charset($headers)) { - return $charset; - } - - if ($charset = html_extract_charset($htmlContent)) { - return $charset; - } - - return $defaultCharset; -} - -/** - * Extract charset from HTTP headers if it's defined. - * - * @param array $headers HTTP headers array. + * @param string $header HTTP header Content-Type line. * * @return bool|string Charset string if found (lowercase), false otherwise. */ -function headers_extract_charset($headers) +function header_extract_charset($header) { - if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { - preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); - if (! empty($match[1])) { - return strtolower(trim($match[1])); - } + preg_match('/charset="?([^; ]+)/i', $header, $match); + if (! empty($match[1])) { + return strtolower(trim($match[1])); } return false; diff --git a/index.php b/index.php index 27335a3..d57789e 100644 --- a/index.php +++ b/index.php @@ -1425,16 +1425,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager) // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.) if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { // Short timeout to keep the application responsive - list($headers, $content) = get_http_response($url, 4); - if (strpos($headers[0], '200 OK') !== false) { - // Retrieve charset. - $charset = get_charset($headers, $content); - // Extract title. - $title = html_extract_title($content); - // Re-encode title in utf-8 if necessary. - if (! empty($title) && strtolower($charset) != 'utf-8') { - $title = mb_convert_encoding($title, 'utf-8', $charset); - } + // The callback will fill $charset and $title with data from the downloaded page. + get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title)); + if (! empty($title) && strtolower($charset) != 'utf-8') { + $title = mb_convert_encoding($title, 'utf-8', $charset); } } diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php index 9967932..7fbd59b 100644 --- a/tests/LinkUtilsTest.php +++ b/tests/LinkUtilsTest.php @@ -28,28 +28,14 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase $this->assertFalse(html_extract_title($html)); } - /** - * Test get_charset() with all priorities. - */ - public function testGetCharset() - { - $headers = array('Content-Type' => 'text/html; charset=Headers'); - $html = 'stuff'; - $default = 'default'; - $this->assertEquals('headers', get_charset($headers, $html, $default)); - $this->assertEquals('html', get_charset(array(), $html, $default)); - $this->assertEquals($default, get_charset(array(), '', $default)); - $this->assertEquals('utf-8', get_charset(array(), '')); - } - /** * Test headers_extract_charset() when the charset is found. */ public function testHeadersExtractExistentCharset() { $charset = 'x-MacCroatian'; - $headers = array('Content-Type' => 'text/html; charset='. $charset); - $this->assertEquals(strtolower($charset), headers_extract_charset($headers)); + $headers = 'text/html; charset='. $charset; + $this->assertEquals(strtolower($charset), header_extract_charset($headers)); } /** @@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase */ public function testHeadersExtractNonExistentCharset() { - $headers = array(); - $this->assertFalse(headers_extract_charset($headers)); + $headers = ''; + $this->assertFalse(header_extract_charset($headers)); - $headers = array('Content-Type' => 'text/html'); - $this->assertFalse(headers_extract_charset($headers)); + $headers = 'text/html'; + $this->assertFalse(header_extract_charset($headers)); } /** @@ -85,6 +71,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase $this->assertFalse(html_extract_charset($html)); } + /** + * Test the download callback with valid value + */ + public function testCurlDownloadCallbackOk() + { + $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok'); + $data = [ + 'HTTP/1.1 200 OK', + 'Server: GitHub.com', + 'Date: Sat, 28 Oct 2017 12:01:33 GMT', + 'Content-Type: text/html; charset=utf-8', + 'Status: 200 OK', + 'end' => 'th=device-width">Refactoring · GitHubRefactoring · GitHub', + 'end' => 'th=device-width">Refactoring · GitHubRefactoring · GitHub