From 919c9803443d5b05623fb34e755c85e1e3c22d91 Mon Sep 17 00:00:00 2001 From: nodiscc Date: Thu, 19 Oct 2017 18:06:07 +0200 Subject: [PATCH 01/55] documentation: update tag cloud/filtering doc Ref. https://github.com/shaarli/Shaarli/issues/959 --- doc/md/Browsing-and-searching.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/md/Browsing-and-searching.md b/doc/md/Browsing-and-searching.md index 3570748..2448313 100644 --- a/doc/md/Browsing-and-searching.md +++ b/doc/md/Browsing-and-searching.md @@ -14,10 +14,24 @@ Use the `Filter by tags` field to restrict displayed links to entries tagged wit **Hidden tags:** Tags starting with a dot `.` (example `.secret`) are private. They can only be seen and searched when logged in. -Alternatively you can use the `Tag cloud` to discover all tags and click on any of them to display related links. +### Tag cloud -To search for links that are not tagged, enter `""` in the tag search field. +The `Tag cloud` page displays a "cloud" view of all tags in your Shaarli. + + * More frequently used tags are displayed with a bigger font size. + * When sorting by `Most used` or `Alphabetical`, tags are displayed as a _list_, along with counters and edit/delete buttons for each tag. + * Clicking on any tag will display a list of all Shaares matching this tag. + * Clicking on the counter next to a tag `example` will filter the tag cloud to only display tags found in Shaares tagged `example`. Repeat this any number of times to further filter the tag cloud. Click `List all links with those tags` to display Shaares matching your current tag filter. ## Filtering RSS feeds/Picture wall RSS feeds can also be restricted to only return items matching a text/tag search: see [RSS feeds](RSS feeds). + +## Filter buttons + +Filter buttons can be found at the top left of the link list. 
They allow you to apply different filters to the list: + + * **Private links:** When this toggle button is enabled, only shaares set to `private` will be shown. + * **Untagged links:** When this toggle button is enabled (top left of the link list), only shaares _without any tags_ will be shown in the link list. + +Filter buttons are only available when logged in. From d65342e304f92643ba922200953cfebc51e1e482 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Sat, 30 Sep 2017 11:04:13 +0200 Subject: [PATCH 02/55] Extract the title/charset during page download, and check content type Use CURLOPT_WRITEFUNCTION to check the response code and content type (only allow HTML). Also extract the title and charset during downloading chunk of data, and stop it when everything has been extracted. Closes #579 --- application/HttpUtils.php | 14 ++- application/LinkUtils.php | 89 ++++++++------ index.php | 14 +-- tests/LinkUtilsTest.php | 244 ++++++++++++++++++++++++++++++++++---- 4 files changed, 293 insertions(+), 68 deletions(-) diff --git a/application/HttpUtils.php b/application/HttpUtils.php index 0083596..2edf5ce 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php @@ -3,9 +3,11 @@ * GET an HTTP URL to retrieve its content * Uses the cURL library or a fallback method * - * @param string $url URL to get (http://...) - * @param int $timeout network timeout (in seconds) - * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param string $url URL to get (http://...) + * @param int $timeout network timeout (in seconds) + * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). + * Can be used to add download conditions on the headers (response code, content type, etc.). 
* * @return array HTTP response headers, downloaded content * @@ -29,7 +31,7 @@ * @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/1462720 */ -function get_http_response($url, $timeout = 30, $maxBytes = 4194304) +function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) { $urlObj = new Url($url); $cleanUrl = $urlObj->idnToAscii(); @@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304) curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); + if (is_callable($curlWriteFunction)) { + curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); + } + // Max download size management curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024); curl_setopt($ch, CURLOPT_NOPROGRESS, false); diff --git a/application/LinkUtils.php b/application/LinkUtils.php index 976474d..c0dd32a 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php @@ -1,5 +1,54 @@ ). - * 3. Use a default charset (default: UTF-8). + * Extract charset from HTTP header if it's defined. * - * @param array $headers HTTP headers array. - * @param string $htmlContent HTML content where to look for charset. - * @param string $defaultCharset Default charset to apply if other methods failed. - * - * @return string Determined charset. - */ -function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') -{ - if ($charset = headers_extract_charset($headers)) { - return $charset; - } - - if ($charset = html_extract_charset($htmlContent)) { - return $charset; - } - - return $defaultCharset; -} - -/** - * Extract charset from HTTP headers if it's defined. - * - * @param array $headers HTTP headers array. + * @param string $header HTTP header Content-Type line. * * @return bool|string Charset string if found (lowercase), false otherwise. */ -function headers_extract_charset($headers) +function header_extract_charset($header) { - if (! 
empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { - preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); - if (! empty($match[1])) { - return strtolower(trim($match[1])); - } + preg_match('/charset="?([^; ]+)/i', $header, $match); + if (! empty($match[1])) { + return strtolower(trim($match[1])); } return false; diff --git a/index.php b/index.php index fb00a9f..ac51038 100644 --- a/index.php +++ b/index.php @@ -1428,16 +1428,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history) // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.) if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { // Short timeout to keep the application responsive - list($headers, $content) = get_http_response($url, 4); - if (strpos($headers[0], '200 OK') !== false) { - // Retrieve charset. - $charset = get_charset($headers, $content); - // Extract title. - $title = html_extract_title($content); - // Re-encode title in utf-8 if necessary. - if (! empty($title) && strtolower($charset) != 'utf-8') { - $title = mb_convert_encoding($title, 'utf-8', $charset); - } + // The callback will fill $charset and $title with data from the downloaded page. + get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title)); + if (! empty($title) && strtolower($charset) != 'utf-8') { + $title = mb_convert_encoding($title, 'utf-8', $charset); } } diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php index 7c0d4b0..ef650f4 100644 --- a/tests/LinkUtilsTest.php +++ b/tests/LinkUtilsTest.php @@ -28,28 +28,14 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase $this->assertFalse(html_extract_title($html)); } - /** - * Test get_charset() with all priorities. 
- */ - public function testGetCharset() - { - $headers = array('Content-Type' => 'text/html; charset=Headers'); - $html = 'stuff'; - $default = 'default'; - $this->assertEquals('headers', get_charset($headers, $html, $default)); - $this->assertEquals('html', get_charset(array(), $html, $default)); - $this->assertEquals($default, get_charset(array(), '', $default)); - $this->assertEquals('utf-8', get_charset(array(), '')); - } - /** * Test headers_extract_charset() when the charset is found. */ public function testHeadersExtractExistentCharset() { $charset = 'x-MacCroatian'; - $headers = array('Content-Type' => 'text/html; charset='. $charset); - $this->assertEquals(strtolower($charset), headers_extract_charset($headers)); + $headers = 'text/html; charset='. $charset; + $this->assertEquals(strtolower($charset), header_extract_charset($headers)); } /** @@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase */ public function testHeadersExtractNonExistentCharset() { - $headers = array(); - $this->assertFalse(headers_extract_charset($headers)); + $headers = ''; + $this->assertFalse(header_extract_charset($headers)); - $headers = array('Content-Type' => 'text/html'); - $this->assertFalse(headers_extract_charset($headers)); + $headers = 'text/html'; + $this->assertFalse(header_extract_charset($headers)); } /** @@ -85,6 +71,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase $this->assertFalse(html_extract_charset($html)); } + /** + * Test the download callback with valid value + */ + public function testCurlDownloadCallbackOk() + { + $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok'); + $data = [ + 'HTTP/1.1 200 OK', + 'Server: GitHub.com', + 'Date: Sat, 28 Oct 2017 12:01:33 GMT', + 'Content-Type: text/html; charset=utf-8', + 'Status: 200 OK', + 'end' => 'th=device-width">Refactoring · GitHubRefactoring · GitHub', + 'end' => 'th=device-width">Refactoring · GitHubRefactoring · GitHub
- + {$shaarlititle} @@ -12,32 +12,32 @@