Merge pull request #977 from ArthurHoaro/feature/dl-filter
Extract the title/charset during page download, and check content type
This commit is contained in:
commit
d449f79a0d
4 changed files with 293 additions and 68 deletions
|
@ -6,6 +6,8 @@
|
||||||
* @param string $url URL to get (http://...)
|
* @param string $url URL to get (http://...)
|
||||||
* @param int $timeout network timeout (in seconds)
|
* @param int $timeout network timeout (in seconds)
|
||||||
* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
|
* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
|
||||||
|
* @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
|
||||||
|
* Can be used to add download conditions on the headers (response code, content type, etc.).
|
||||||
*
|
*
|
||||||
* @return array HTTP response headers, downloaded content
|
* @return array HTTP response headers, downloaded content
|
||||||
*
|
*
|
||||||
|
@ -29,7 +31,7 @@
|
||||||
* @see http://stackoverflow.com/q/9183178
|
* @see http://stackoverflow.com/q/9183178
|
||||||
* @see http://stackoverflow.com/q/1462720
|
* @see http://stackoverflow.com/q/1462720
|
||||||
*/
|
*/
|
||||||
function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
|
function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
|
||||||
{
|
{
|
||||||
$urlObj = new Url($url);
|
$urlObj = new Url($url);
|
||||||
$cleanUrl = $urlObj->idnToAscii();
|
$cleanUrl = $urlObj->idnToAscii();
|
||||||
|
@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
|
||||||
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
|
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
|
||||||
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
|
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
|
||||||
|
|
||||||
|
if (is_callable($curlWriteFunction)) {
|
||||||
|
curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
|
||||||
|
}
|
||||||
|
|
||||||
// Max download size management
|
// Max download size management
|
||||||
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
|
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
|
||||||
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
|
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
|
||||||
|
|
|
@ -1,5 +1,54 @@
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get cURL callback function for CURLOPT_WRITEFUNCTION
|
||||||
|
*
|
||||||
|
* @param string $charset to extract from the downloaded page (reference)
|
||||||
|
* @param string $title to extract from the downloaded page (reference)
|
||||||
|
* @param string $curlGetInfo Optionally overrides the curl_getinfo function
|
||||||
|
*
|
||||||
|
* @return Closure
|
||||||
|
*/
|
||||||
|
function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
|
||||||
|
*
|
||||||
|
* While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'.
|
||||||
|
* Then we extract the title and the charset and stop the download when it's done.
|
||||||
|
*
|
||||||
|
* @param resource $ch cURL resource
|
||||||
|
* @param string $data chunk of data being downloaded
|
||||||
|
*
|
||||||
|
* @return int|bool length of $data or false if we need to stop the download
|
||||||
|
*/
|
||||||
|
return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
|
||||||
|
$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
|
||||||
|
if (!empty($responseCode) && $responseCode != 200) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
$contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
|
||||||
|
if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (empty($charset)) {
|
||||||
|
$charset = header_extract_charset($contentType);
|
||||||
|
}
|
||||||
|
if (empty($charset)) {
|
||||||
|
$charset = html_extract_charset($data);
|
||||||
|
}
|
||||||
|
if (empty($title)) {
|
||||||
|
$title = html_extract_title($data);
|
||||||
|
}
|
||||||
|
// We got everything we want, stop the download.
|
||||||
|
if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return strlen($data);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract title from an HTML document.
|
* Extract title from an HTML document.
|
||||||
*
|
*
|
||||||
|
@ -16,46 +65,18 @@ function html_extract_title($html)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine charset from downloaded page.
|
* Extract charset from HTTP header if it's defined.
|
||||||
* Priority:
|
|
||||||
* 1. HTTP headers (Content type).
|
|
||||||
* 2. HTML content page (tag <meta charset>).
|
|
||||||
* 3. Use a default charset (default: UTF-8).
|
|
||||||
*
|
*
|
||||||
* @param array $headers HTTP headers array.
|
* @param string $header HTTP header Content-Type line.
|
||||||
* @param string $htmlContent HTML content where to look for charset.
|
|
||||||
* @param string $defaultCharset Default charset to apply if other methods failed.
|
|
||||||
*
|
|
||||||
* @return string Determined charset.
|
|
||||||
*/
|
|
||||||
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
|
|
||||||
{
|
|
||||||
if ($charset = headers_extract_charset($headers)) {
|
|
||||||
return $charset;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($charset = html_extract_charset($htmlContent)) {
|
|
||||||
return $charset;
|
|
||||||
}
|
|
||||||
|
|
||||||
return $defaultCharset;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract charset from HTTP headers if it's defined.
|
|
||||||
*
|
|
||||||
* @param array $headers HTTP headers array.
|
|
||||||
*
|
*
|
||||||
* @return bool|string Charset string if found (lowercase), false otherwise.
|
* @return bool|string Charset string if found (lowercase), false otherwise.
|
||||||
*/
|
*/
|
||||||
function headers_extract_charset($headers)
|
function header_extract_charset($header)
|
||||||
{
|
{
|
||||||
if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
|
preg_match('/charset="?([^; ]+)/i', $header, $match);
|
||||||
preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
|
|
||||||
if (! empty($match[1])) {
|
if (! empty($match[1])) {
|
||||||
return strtolower(trim($match[1]));
|
return strtolower(trim($match[1]));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
10
index.php
10
index.php
|
@ -1425,18 +1425,12 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager)
|
||||||
// If this is an HTTP(S) link, we try to get the page to extract the title (otherwise we will go straight to the edit form.)
|
// If this is an HTTP(S) link, we try to get the page to extract the title (otherwise we will go straight to the edit form.)
|
||||||
if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
|
if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
|
||||||
// Short timeout to keep the application responsive
|
// Short timeout to keep the application responsive
|
||||||
list($headers, $content) = get_http_response($url, 4);
|
// The callback will fill $charset and $title with data from the downloaded page.
|
||||||
if (strpos($headers[0], '200 OK') !== false) {
|
get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
|
||||||
// Retrieve charset.
|
|
||||||
$charset = get_charset($headers, $content);
|
|
||||||
// Extract title.
|
|
||||||
$title = html_extract_title($content);
|
|
||||||
// Re-encode title in utf-8 if necessary.
|
|
||||||
if (! empty($title) && strtolower($charset) != 'utf-8') {
|
if (! empty($title) && strtolower($charset) != 'utf-8') {
|
||||||
$title = mb_convert_encoding($title, 'utf-8', $charset);
|
$title = mb_convert_encoding($title, 'utf-8', $charset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if ($url == '') {
|
if ($url == '') {
|
||||||
$url = '?' . smallHash($linkdate . $LINKSDB->getNextId());
|
$url = '?' . smallHash($linkdate . $LINKSDB->getNextId());
|
||||||
|
|
|
@ -28,28 +28,14 @@ public function testHtmlExtractNonExistentTitle()
|
||||||
$this->assertFalse(html_extract_title($html));
|
$this->assertFalse(html_extract_title($html));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Test get_charset() with all priorities.
|
|
||||||
*/
|
|
||||||
public function testGetCharset()
|
|
||||||
{
|
|
||||||
$headers = array('Content-Type' => 'text/html; charset=Headers');
|
|
||||||
$html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
|
|
||||||
$default = 'default';
|
|
||||||
$this->assertEquals('headers', get_charset($headers, $html, $default));
|
|
||||||
$this->assertEquals('html', get_charset(array(), $html, $default));
|
|
||||||
$this->assertEquals($default, get_charset(array(), '', $default));
|
|
||||||
$this->assertEquals('utf-8', get_charset(array(), ''));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test headers_extract_charset() when the charset is found.
|
* Test headers_extract_charset() when the charset is found.
|
||||||
*/
|
*/
|
||||||
public function testHeadersExtractExistentCharset()
|
public function testHeadersExtractExistentCharset()
|
||||||
{
|
{
|
||||||
$charset = 'x-MacCroatian';
|
$charset = 'x-MacCroatian';
|
||||||
$headers = array('Content-Type' => 'text/html; charset='. $charset);
|
$headers = 'text/html; charset='. $charset;
|
||||||
$this->assertEquals(strtolower($charset), headers_extract_charset($headers));
|
$this->assertEquals(strtolower($charset), header_extract_charset($headers));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -57,11 +43,11 @@ public function testHeadersExtractExistentCharset()
|
||||||
*/
|
*/
|
||||||
public function testHeadersExtractNonExistentCharset()
|
public function testHeadersExtractNonExistentCharset()
|
||||||
{
|
{
|
||||||
$headers = array();
|
$headers = '';
|
||||||
$this->assertFalse(headers_extract_charset($headers));
|
$this->assertFalse(header_extract_charset($headers));
|
||||||
|
|
||||||
$headers = array('Content-Type' => 'text/html');
|
$headers = 'text/html';
|
||||||
$this->assertFalse(headers_extract_charset($headers));
|
$this->assertFalse(header_extract_charset($headers));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -85,6 +71,131 @@ public function testHtmlExtractNonExistentCharset()
|
||||||
$this->assertFalse(html_extract_charset($html));
|
$this->assertFalse(html_extract_charset($html));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with valid value
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackOk()
|
||||||
|
{
|
||||||
|
$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
|
||||||
|
$data = [
|
||||||
|
'HTTP/1.1 200 OK',
|
||||||
|
'Server: GitHub.com',
|
||||||
|
'Date: Sat, 28 Oct 2017 12:01:33 GMT',
|
||||||
|
'Content-Type: text/html; charset=utf-8',
|
||||||
|
'Status: 200 OK',
|
||||||
|
'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
|
||||||
|
'<title>ignored</title>',
|
||||||
|
];
|
||||||
|
foreach ($data as $key => $line) {
|
||||||
|
$ignore = null;
|
||||||
|
$expected = $key !== 'end' ? strlen($line) : false;
|
||||||
|
$this->assertEquals($expected, $callback($ignore, $line));
|
||||||
|
if ($expected === false) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$this->assertEquals('utf-8', $charset);
|
||||||
|
$this->assertEquals('Refactoring · GitHub', $title);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with valid values and no charset
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackOkNoCharset()
|
||||||
|
{
|
||||||
|
$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
|
||||||
|
$data = [
|
||||||
|
'HTTP/1.1 200 OK',
|
||||||
|
'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
|
||||||
|
'<title>ignored</title>',
|
||||||
|
];
|
||||||
|
foreach ($data as $key => $line) {
|
||||||
|
$ignore = null;
|
||||||
|
$this->assertEquals(strlen($line), $callback($ignore, $line));
|
||||||
|
}
|
||||||
|
$this->assertEmpty($charset);
|
||||||
|
$this->assertEquals('Refactoring · GitHub', $title);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with valid values and a charset found only in the HTML content
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackOkHtmlCharset()
|
||||||
|
{
|
||||||
|
$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
|
||||||
|
$data = [
|
||||||
|
'HTTP/1.1 200 OK',
|
||||||
|
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
|
||||||
|
'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
|
||||||
|
'<title>ignored</title>',
|
||||||
|
];
|
||||||
|
foreach ($data as $key => $line) {
|
||||||
|
$ignore = null;
|
||||||
|
$expected = $key !== 'end' ? strlen($line) : false;
|
||||||
|
$this->assertEquals($expected, $callback($ignore, $line));
|
||||||
|
if ($expected === false) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$this->assertEquals('utf-8', $charset);
|
||||||
|
$this->assertEquals('Refactoring · GitHub', $title);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with valid values and no title
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackOkNoTitle()
|
||||||
|
{
|
||||||
|
$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
|
||||||
|
$data = [
|
||||||
|
'HTTP/1.1 200 OK',
|
||||||
|
'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
|
||||||
|
'ignored',
|
||||||
|
];
|
||||||
|
foreach ($data as $key => $line) {
|
||||||
|
$ignore = null;
|
||||||
|
$this->assertEquals(strlen($line), $callback($ignore, $line));
|
||||||
|
}
|
||||||
|
$this->assertEquals('utf-8', $charset);
|
||||||
|
$this->assertEmpty($title);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with an invalid content type.
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackInvalidContentType()
|
||||||
|
{
|
||||||
|
$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
|
||||||
|
$ignore = null;
|
||||||
|
$this->assertFalse($callback($ignore, ''));
|
||||||
|
$this->assertEmpty($charset);
|
||||||
|
$this->assertEmpty($title);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with an invalid response code.
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackInvalidResponseCode()
|
||||||
|
{
|
||||||
|
$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
|
||||||
|
$ignore = null;
|
||||||
|
$this->assertFalse($callback($ignore, ''));
|
||||||
|
$this->assertEmpty($charset);
|
||||||
|
$this->assertEmpty($title);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the download callback with an invalid content type and response code.
|
||||||
|
*/
|
||||||
|
public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
|
||||||
|
{
|
||||||
|
$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
|
||||||
|
$ignore = null;
|
||||||
|
$this->assertFalse($callback($ignore, ''));
|
||||||
|
$this->assertEmpty($charset);
|
||||||
|
$this->assertEmpty($title);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test count_private.
|
* Test count_private.
|
||||||
*/
|
*/
|
||||||
|
@ -207,3 +318,96 @@ private function getHashtagLink($hashtag, $index = '')
|
||||||
return str_replace('$1', $hashtag, $hashtagLink);
|
return str_replace('$1', $hashtag, $hashtagLink);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// old style mock: PHPUnit doesn't allow function mock
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns code 200 or html content type.
|
||||||
|
*
|
||||||
|
* @param resource $ch cURL resource
|
||||||
|
* @param int $type cURL info type
|
||||||
|
*
|
||||||
|
* @return int|string 200 or 'text/html'
|
||||||
|
*/
|
||||||
|
function ut_curl_getinfo_ok($ch, $type)
|
||||||
|
{
|
||||||
|
switch ($type) {
|
||||||
|
case CURLINFO_RESPONSE_CODE:
|
||||||
|
return 200;
|
||||||
|
case CURLINFO_CONTENT_TYPE:
|
||||||
|
return 'text/html; charset=utf-8';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns code 200 or html content type without charset.
|
||||||
|
*
|
||||||
|
* @param resource $ch cURL resource
|
||||||
|
* @param int $type cURL info type
|
||||||
|
*
|
||||||
|
* @return int|string 200 or 'text/html'
|
||||||
|
*/
|
||||||
|
function ut_curl_getinfo_no_charset($ch, $type)
|
||||||
|
{
|
||||||
|
switch ($type) {
|
||||||
|
case CURLINFO_RESPONSE_CODE:
|
||||||
|
return 200;
|
||||||
|
case CURLINFO_CONTENT_TYPE:
|
||||||
|
return 'text/html';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Invalid response code.
|
||||||
|
*
|
||||||
|
* @param resource $ch cURL resource
|
||||||
|
* @param int $type cURL info type
|
||||||
|
*
|
||||||
|
* @return int|string 404 or 'text/html'
|
||||||
|
*/
|
||||||
|
function ut_curl_getinfo_rc_ko($ch, $type)
|
||||||
|
{
|
||||||
|
switch ($type) {
|
||||||
|
case CURLINFO_RESPONSE_CODE:
|
||||||
|
return 404;
|
||||||
|
case CURLINFO_CONTENT_TYPE:
|
||||||
|
return 'text/html; charset=utf-8';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Invalid content type.
|
||||||
|
*
|
||||||
|
* @param resource $ch cURL resource
|
||||||
|
* @param int $type cURL info type
|
||||||
|
*
|
||||||
|
* @return int|string 200 or 'text/plain'
|
||||||
|
*/
|
||||||
|
function ut_curl_getinfo_ct_ko($ch, $type)
|
||||||
|
{
|
||||||
|
switch ($type) {
|
||||||
|
case CURLINFO_RESPONSE_CODE:
|
||||||
|
return 200;
|
||||||
|
case CURLINFO_CONTENT_TYPE:
|
||||||
|
return 'text/plain';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Invalid response code and content type.
|
||||||
|
*
|
||||||
|
* @param resource $ch cURL resource
|
||||||
|
* @param int $type cURL info type
|
||||||
|
*
|
||||||
|
* @return int|string 404 or 'text/plain'
|
||||||
|
*/
|
||||||
|
function ut_curl_getinfo_rs_ct_ko($ch, $type)
|
||||||
|
{
|
||||||
|
switch ($type) {
|
||||||
|
case CURLINFO_RESPONSE_CODE:
|
||||||
|
return 404;
|
||||||
|
case CURLINFO_CONTENT_TYPE:
|
||||||
|
return 'text/plain';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue