diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 35a5b290..77eb2d95 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -7,13 +7,25 @@ * * @param string $charset to extract from the downloaded page (reference) * @param string $title to extract from the downloaded page (reference) + * @param string $description to extract from the downloaded page (reference) + * @param string $keywords to extract from the downloaded page (reference) + * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content * @param string $curlGetInfo Optionally overrides curl_getinfo function * * @return Closure */ -function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo') -{ +function get_curl_download_callback( + &$charset, + &$title, + &$description, + &$keywords, + $retrieveDescription, + $curlGetInfo = 'curl_getinfo' +) { $isRedirected = false; + $currentChunk = 0; + $foundChunk = null; + /** * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). * @@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get * * @return int|bool length of $data or false if we need to stop the download */ - return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) { + return function (&$ch, $data) use ( + $retrieveDescription, + $curlGetInfo, + &$charset, + &$title, + &$description, + &$keywords, + &$isRedirected, + &$currentChunk, + &$foundChunk + ) { + $currentChunk++; $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); if (!empty($responseCode) && in_array($responseCode, [301, 302])) { $isRedirected = true; @@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get } if (empty($title)) { $title = html_extract_title($data); + $foundChunk = ! empty($title) ? 
$currentChunk : $foundChunk; } + if ($retrieveDescription && empty($description)) { + $description = html_extract_tag('description', $data); + $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; + } + if ($retrieveDescription && empty($keywords)) { + $keywords = html_extract_tag('keywords', $data); + if (! empty($keywords)) { + $foundChunk = $currentChunk; + // Keywords use the format tag1, tag2 multiple words, tag + // So we format them to match Shaarli's separator and glue multiple words with '-' + $keywords = implode(' ', array_map(function($keyword) { + return implode('-', preg_split('/\s+/', trim($keyword))); + }, explode(',', $keywords))); + } + } + // We got everything we want, stop the download. - if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { + // If we already found either the title, description or keywords, + // it's highly unlikely that we'll find the other metas further than + // in the same chunk of data or the next one. So we also stop the download after that. + if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null + && (! $retrieveDescription + || $foundChunk < $currentChunk + || (!empty($title) && !empty($description) && !empty($keywords)) + ) + ) { return false; } @@ -110,6 +158,35 @@ function html_extract_charset($html) return false; } +/** + * Extract the content of a meta tag from HTML content, declared as either: + * - OpenGraph: <meta property="og:[tag]" content="..."> + * - Meta tag: <meta name="[tag]" content="..."> (the `itemprop` attribute is accepted too) + * + * @param string $tag Name of the tag to retrieve (e.g. 'description' or 'keywords'). + * @param string $html HTML content where to look for the meta tag. + * + * @return bool|string Value of the content attribute if found, false otherwise. + */ +function html_extract_tag($tag, $html) +{ + $propertiesKey = ['property', 'name', 'itemprop']; + $properties = implode('|', $propertiesKey); + // Try to retrieve the tag when attributes are in the order property => content. + $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. 
$tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; + // If the attributes are not in the order property => content (e.g. Github) + // New regex to keep this readable... more or less. + $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s/>]#'; + + if (preg_match($ogRegex, $html, $matches) > 0 + || preg_match($ogRegexReverse, $html, $matches) > 0 + ) { + return $matches[1]; + } + + return false; +} + /** * Count private links in given linklist. * diff --git a/application/config/ConfigManager.php b/application/config/ConfigManager.php index 30993928..c95e6800 100644 --- a/application/config/ConfigManager.php +++ b/application/config/ConfigManager.php @@ -365,6 +365,7 @@ protected function setDefaultValues() $this->setEmpty('general.links_per_page', 20); $this->setEmpty('general.enabled_plugins', self::$DEFAULT_PLUGINS); $this->setEmpty('general.default_note_title', 'Note: '); + $this->setEmpty('general.retrieve_description', false); $this->setEmpty('updates.check_updates', false); $this->setEmpty('updates.check_updates_branch', 'stable'); diff --git a/doc/md/Shaarli-configuration.md b/doc/md/Shaarli-configuration.md index a931ab1e..664e36dd 100644 --- a/doc/md/Shaarli-configuration.md +++ b/doc/md/Shaarli-configuration.md @@ -56,6 +56,8 @@ _These settings should not be edited_ - **timezone**: See [the list of supported timezones](http://php.net/manual/en/timezones.php). - **enabled_plugins**: List of enabled plugins. - **default_note_title**: Default title of a new note. +- **retrieve_description** (boolean): If set to true, for every new link Shaarli will try +to retrieve the description and keywords from the HTML meta tags. 
### Security diff --git a/inc/languages/fr/LC_MESSAGES/shaarli.po b/inc/languages/fr/LC_MESSAGES/shaarli.po index c2c73b29..611296f1 100644 --- a/inc/languages/fr/LC_MESSAGES/shaarli.po +++ b/inc/languages/fr/LC_MESSAGES/shaarli.po @@ -1,8 +1,8 @@ msgid "" msgstr "" "Project-Id-Version: Shaarli\n" -"POT-Creation-Date: 2019-05-25 16:37+0200\n" -"PO-Revision-Date: 2019-05-25 16:37+0200\n" +"POT-Creation-Date: 2019-07-06 12:14+0200\n" +"PO-Revision-Date: 2019-07-06 12:17+0200\n" "Last-Translator: \n" "Language-Team: Shaarli\n" "Language: fr_FR\n" @@ -252,7 +252,7 @@ msgstr "404 Introuvable" msgid "Couldn't retrieve updater class methods." msgstr "Impossible de récupérer les méthodes de la classe Updater." -#: application/updater/Updater.php:526 index.php:1033 +#: application/updater/Updater.php:526 index.php:1034 msgid "" "You have enabled or changed thumbnails mode. Please synchronize them." @@ -337,8 +337,8 @@ msgid "You are not supposed to change a password on an Open Shaarli." msgstr "" "Vous n'êtes pas censé modifier le mot de passe d'un Shaarli en mode ouvert." -#: index.php:957 index.php:1007 index.php:1092 index.php:1122 index.php:1232 -#: index.php:1279 +#: index.php:957 index.php:1007 index.php:1094 index.php:1124 index.php:1234 +#: index.php:1281 msgid "Wrong token." msgstr "Jeton invalide." @@ -356,64 +356,64 @@ msgstr "Votre mot de passe a été modifié" msgid "Change password" msgstr "Modifier le mot de passe" -#: index.php:1053 +#: index.php:1054 msgid "Configuration was saved." msgstr "La configuration a été sauvegardée." 
-#: index.php:1076 tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:24 +#: index.php:1078 tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:24 msgid "Configure" msgstr "Configurer" -#: index.php:1086 tmp/changetag.b91ef64efc3688266305ea9b42e5017e.rtpl.php:13 +#: index.php:1088 tmp/changetag.b91ef64efc3688266305ea9b42e5017e.rtpl.php:13 #: tmp/tools.b91ef64efc3688266305ea9b42e5017e.rtpl.php:36 msgid "Manage tags" msgstr "Gérer les tags" -#: index.php:1105 +#: index.php:1107 #, php-format msgid "The tag was removed from %d link." msgid_plural "The tag was removed from %d links." msgstr[0] "Le tag a été supprimé de %d lien." msgstr[1] "Le tag a été supprimé de %d liens." -#: index.php:1106 +#: index.php:1108 #, php-format msgid "The tag was renamed in %d link." msgid_plural "The tag was renamed in %d links." msgstr[0] "Le tag a été renommé dans %d lien." msgstr[1] "Le tag a été renommé dans %d liens." -#: index.php:1113 tmp/addlink.b91ef64efc3688266305ea9b42e5017e.rtpl.php:13 +#: index.php:1115 tmp/addlink.b91ef64efc3688266305ea9b42e5017e.rtpl.php:13 msgid "Shaare a new link" msgstr "Partager un nouveau lien" -#: index.php:1342 tmp/linklist.b91ef64efc3688266305ea9b42e5017e.rtpl.php:169 +#: index.php:1344 tmp/linklist.b91ef64efc3688266305ea9b42e5017e.rtpl.php:169 msgid "Edit" msgstr "Modifier" -#: index.php:1342 index.php:1413 +#: index.php:1344 index.php:1416 #: tmp/page.header.b91ef64efc3688266305ea9b42e5017e.rtpl.php:26 #: tmp/page.header.cedf684561d925457130839629000a81.rtpl.php:26 msgid "Shaare" msgstr "Shaare" -#: index.php:1382 +#: index.php:1385 msgid "Note: " msgstr "Note : " -#: index.php:1421 +#: index.php:1424 msgid "Invalid link ID provided" msgstr "" -#: index.php:1441 tmp/export.b91ef64efc3688266305ea9b42e5017e.rtpl.php:65 +#: index.php:1444 tmp/export.b91ef64efc3688266305ea9b42e5017e.rtpl.php:65 msgid "Export" msgstr "Exporter" -#: index.php:1503 tmp/import.b91ef64efc3688266305ea9b42e5017e.rtpl.php:83 +#: index.php:1506 
tmp/import.b91ef64efc3688266305ea9b42e5017e.rtpl.php:83 msgid "Import" msgstr "Importer" -#: index.php:1513 +#: index.php:1516 #, php-format msgid "" "The file you are trying to upload is probably bigger than what this " @@ -423,20 +423,20 @@ msgstr "" "le serveur web peut accepter (%s). Merci de l'envoyer en parties plus " "légères." -#: index.php:1558 tmp/pluginsadmin.b91ef64efc3688266305ea9b42e5017e.rtpl.php:26 +#: index.php:1561 tmp/pluginsadmin.b91ef64efc3688266305ea9b42e5017e.rtpl.php:26 #: tmp/tools.b91ef64efc3688266305ea9b42e5017e.rtpl.php:22 msgid "Plugin administration" msgstr "Administration des plugins" -#: index.php:1612 tmp/thumbnails.b91ef64efc3688266305ea9b42e5017e.rtpl.php:14 +#: index.php:1615 tmp/thumbnails.b91ef64efc3688266305ea9b42e5017e.rtpl.php:14 msgid "Thumbnails update" msgstr "Mise à jour des miniatures" -#: index.php:1778 +#: index.php:1781 msgid "Search: " msgstr "Recherche : " -#: index.php:1821 +#: index.php:1824 #, php-format msgid "" "
Sessions do not seem to work correctly on your server.
Make sure the " @@ -455,7 +455,7 @@ msgstr "" "des cookies. Nous vous recommandons d'accéder à votre serveur depuis son " "adresse IP ou un Fully Qualified Domain Name.
" -#: index.php:1831 +#: index.php:1834 msgid "Click to try again." msgstr "Cliquer ici pour réessayer." @@ -592,7 +592,7 @@ msgstr "Mauvaise réponse du hub %s" msgid "Enable PubSubHubbub feed publishing." msgstr "Active la publication de flux vers PubSubHubbub." -#: plugins/qrcode/qrcode.php:73 plugins/wallabag/wallabag.php:68 +#: plugins/qrcode/qrcode.php:72 plugins/wallabag/wallabag.php:68 msgid "For each link, add a QRCode icon." msgstr "Pour chaque lien, ajouter une icône de QRCode." @@ -679,6 +679,34 @@ msgstr "Vous pouvez aussi modifier les tags dans la" msgid "tag list" msgstr "liste des tags" +#: tmp/configure.90100d2eaf5d3705e14b9b4f78ecddc9.rtpl.php:143 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:312 +#: tmp/export.b91ef64efc3688266305ea9b42e5017e.rtpl.php:31 +msgid "All" +msgstr "Tous" + +#: tmp/configure.90100d2eaf5d3705e14b9b4f78ecddc9.rtpl.php:147 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:316 +msgid "Only common media hosts" +msgstr "Seulement les hébergeurs de média connus" + +#: tmp/configure.90100d2eaf5d3705e14b9b4f78ecddc9.rtpl.php:151 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:320 +msgid "None" +msgstr "Aucune" + +#: tmp/configure.90100d2eaf5d3705e14b9b4f78ecddc9.rtpl.php:158 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:297 +msgid "You need to enable the extensionphp-gd
to use thumbnails." +msgstr "" +"Vous devez activer l'extensionphp-gd
pour utiliser les " +"miniatures." + +#: tmp/configure.90100d2eaf5d3705e14b9b4f78ecddc9.rtpl.php:162 +#| msgid "Enable thumbnails" +msgid "Synchonize thumbnails" +msgstr "" + #: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:29 msgid "title" msgstr "titre" @@ -762,50 +790,41 @@ msgid "Notify me when a new release is ready" msgstr "Me notifier lorsqu'une nouvelle version est disponible" #: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:247 +msgid "Automatically retrieve description for new bookmarks" +msgstr "Récupérer automatiquement la description" + +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:248 +msgid "Shaarli will try to retrieve the description from meta HTML headers" +msgstr "" +"Shaarli essaiera de récupérer la description depuis les balises HTML meta " +"dans les entêtes" + +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:263 #: tmp/install.b91ef64efc3688266305ea9b42e5017e.rtpl.php:169 msgid "Enable REST API" msgstr "Activer l'API REST" -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:248 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:264 #: tmp/install.b91ef64efc3688266305ea9b42e5017e.rtpl.php:170 msgid "Allow third party software to use Shaarli such as mobile application" msgstr "" "Permet aux applications tierces d'utiliser Shaarli, par exemple les " "applications mobiles" -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:263 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:279 msgid "API secret" msgstr "Clé d'API secrète" -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:277 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:293 msgid "Enable thumbnails" msgstr "Activer les miniatures" -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:281 -msgid "You need to enable the extensionphp-gd
to use thumbnails." -msgstr "" -"Vous devez activer l'extensionphp-gd
pour utiliser les " -"miniatures." - -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:285 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:301 #: tmp/tools.b91ef64efc3688266305ea9b42e5017e.rtpl.php:56 msgid "Synchronize thumbnails" msgstr "Synchroniser les miniatures" -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:296 -#: tmp/export.b91ef64efc3688266305ea9b42e5017e.rtpl.php:31 -msgid "All" -msgstr "Tous" - -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:300 -msgid "Only common media hosts" -msgstr "Seulement les hébergeurs de média connus" - -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:304 -msgid "None" -msgstr "Aucune" - -#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:312 +#: tmp/configure.b91ef64efc3688266305ea9b42e5017e.rtpl.php:328 #: tmp/editlink.b91ef64efc3688266305ea9b42e5017e.rtpl.php:72 #: tmp/pluginsadmin.b91ef64efc3688266305ea9b42e5017e.rtpl.php:139 #: tmp/pluginsadmin.b91ef64efc3688266305ea9b42e5017e.rtpl.php:199 @@ -1149,17 +1168,13 @@ msgstr "Déconnexion" #: tmp/page.header.b91ef64efc3688266305ea9b42e5017e.rtpl.php:150 #: tmp/page.header.cedf684561d925457130839629000a81.rtpl.php:150 -#, fuzzy -#| msgid "Public" msgid "Set public" -msgstr "Publics" +msgstr "Rendre public" #: tmp/page.header.b91ef64efc3688266305ea9b42e5017e.rtpl.php:155 #: tmp/page.header.cedf684561d925457130839629000a81.rtpl.php:155 -#, fuzzy -#| msgid "Private" msgid "Set private" -msgstr "Privé" +msgstr "Rendre privé" #: tmp/page.header.b91ef64efc3688266305ea9b42e5017e.rtpl.php:187 #: tmp/page.header.cedf684561d925457130839629000a81.rtpl.php:187 @@ -1409,11 +1424,6 @@ msgstr "" "Glisser ce lien dans votre barre de favoris ou cliquer droit dessus et « " "Ajouter aux favoris »" -#, fuzzy -#~| msgid "Enable thumbnails" -#~ msgid "Synchonize thumbnails" -#~ msgstr "Activer les miniatures" - #~ msgid "" #~ "You need to browse your Shaarli over HTTPS to use this " #~ "functionality." 
diff --git a/index.php b/index.php index a14616ed..957d8d9a 100644 --- a/index.php +++ b/index.php @@ -1015,6 +1015,7 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager, $conf->set('general.timezone', $tz); $conf->set('general.title', escape($_POST['title'])); $conf->set('general.header_link', escape($_POST['titleLink'])); + $conf->set('general.retrieve_description', !empty($_POST['retrieveDescription'])); $conf->set('resource.theme', escape($_POST['theme'])); $conf->set('security.session_protection_disabled', !empty($_POST['disablesessionprotection'])); $conf->set('privacy.default_private_links', !empty($_POST['privateLinkByDefault'])); @@ -1063,6 +1064,7 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager, ); $PAGE->assign('continents', $continents); $PAGE->assign('cities', $cities); + $PAGE->assign('retrieve_description', $conf->get('general.retrieve_description')); $PAGE->assign('private_links_default', $conf->get('privacy.default_private_links', false)); $PAGE->assign('session_protection_disabled', $conf->get('security.session_protection_disabled', false)); $PAGE->assign('enable_rss_permalinks', $conf->get('feed.rss_permalinks', false)); @@ -1364,13 +1366,14 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager, // If this is an HTTP(S) link, we try go get the page to extract // the title (otherwise we will to straight to the edit form.) if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { + $retrieveDescription = $conf->get('general.retrieve_description'); // Short timeout to keep the application responsive // The callback will fill $charset and $title with data from the downloaded page. get_http_response( $url, $conf->get('general.download_timeout', 30), $conf->get('general.download_max_size', 4194304), - get_curl_download_callback($charset, $title) + get_curl_download_callback($charset, $title, $description, $tags, $retrieveDescription) ); if (! 
empty($title) && strtolower($charset) != 'utf-8') { $title = mb_convert_encoding($title, 'utf-8', $charset); diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 25fb3043..78cb8f2a 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -2,14 +2,16 @@ namespace Shaarli\Bookmark; +use PHPUnit\Framework\TestCase; use ReferenceLinkDB; +use Shaarli\Config\ConfigManager; require_once 'tests/utils/CurlUtils.php'; /** * Class LinkUtilsTest. */ -class LinkUtilsTest extends \PHPUnit\Framework\TestCase +class LinkUtilsTest extends TestCase { /** * Test html_extract_title() when the title is found. @@ -75,12 +77,57 @@ public function testHtmlExtractNonExistentCharset() $this->assertFalse(html_extract_charset($html)); } + /** + * Test html_extract_tag() when the tag stuff2