Feature: highlight fulltext search results

How it works:

  1. when a fulltext search is made, Shaarli looks for the first
occurence position of every term matching the search. No change here,
but we store these positions in an array, in Bookmark's additionalContent.
  2. when formatting bookmarks (through BookmarkFormatter
implementation):
    1. first we insert specific tokens at every search result positions
    2. we format the content (escape HTML, apply markdown, etc.)
    3. as a last step, we replace our token with displayable span
elements

Cons: this tightens coupling between search filters and formatters
Pros: it was absolutely necessary not to perform the
search twice. this solution has close to no impact on performances.

Fixes #205
This commit is contained in:
ArthurHoaro 2020-10-12 11:35:55 +02:00
parent 64cac25626
commit 4e3875c0ce
7 changed files with 349 additions and 35 deletions

View file

@ -54,6 +54,9 @@ class Bookmark
/** @var bool True if the bookmark can only be seen while logged in */
protected $private;
/** @var mixed[] Available to store any additional content for a bookmark. Currently used for search highlight. */
protected $additionalContent = [];
/**
* Initialize a link from array data. Especially useful to create a Bookmark from former link storage format.
*
@ -95,6 +98,8 @@ public function fromArray(array $data): Bookmark
* - the URL with the permalink
* - the title with the URL
*
* Also make sure that we do not save search highlights in the datastore.
*
* @throws InvalidBookmarkException
*/
public function validate(): void
@ -112,6 +117,9 @@ public function validate(): void
if (empty($this->title)) {
$this->title = $this->url;
}
if (array_key_exists('search_highlight', $this->additionalContent)) {
unset($this->additionalContent['search_highlight']);
}
}
/**
@ -435,6 +443,44 @@ public function setTagsString(?string $tags): Bookmark
return $this;
}
/**
* Get entire additionalContent array.
*
* @return mixed[]
*/
public function getAdditionalContent(): array
{
return $this->additionalContent;
}
/**
* Set a single entry in additionalContent, by key.
*
* @param string $key
* @param mixed|null $value Any type of value can be set.
*
* @return $this
*/
public function addAdditionalContentEntry(string $key, $value): self
{
$this->additionalContent[$key] = $value;
return $this;
}
/**
* Get a single entry in additionalContent, by key.
*
* @param string $key
* @param mixed|null $default
*
* @return mixed|null can be any type or even null.
*/
public function getAdditionalContentEntry(string $key, $default = null)
{
return array_key_exists($key, $this->additionalContent) ? $this->additionalContent[$key] : $default;
}
/**
* Rename a tag in tags list.
*

View file

@ -201,7 +201,7 @@ private function filterFulltext(string $searchterms, string $visibility = 'all')
return $this->noFilter($visibility);
}
$filtered = array();
$filtered = [];
$search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8');
$exactRegex = '/"([^"]+)"/';
// Retrieve exact search terms.
@ -213,8 +213,8 @@ private function filterFulltext(string $searchterms, string $visibility = 'all')
$explodedSearchAnd = array_values(array_filter($explodedSearchAnd));
// Filter excluding terms and update andSearch.
$excludeSearch = array();
$andSearch = array();
$excludeSearch = [];
$andSearch = [];
foreach ($explodedSearchAnd as $needle) {
if ($needle[0] == '-' && strlen($needle) > 1) {
$excludeSearch[] = substr($needle, 1);
@ -234,33 +234,38 @@ private function filterFulltext(string $searchterms, string $visibility = 'all')
}
}
// Concatenate link fields to search across fields.
// Adds a '\' separator for exact search terms.
$content = mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8') .'\\';
$content .= mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8') .'\\';
$content .= mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8') .'\\';
$content .= mb_convert_case($link->getTagsString(), MB_CASE_LOWER, 'UTF-8') .'\\';
$lengths = [];
$content = $this->buildFullTextSearchableLink($link, $lengths);
// Be optimistic
$found = true;
$foundPositions = [];
// First, we look for exact term search
for ($i = 0; $i < count($exactSearch) && $found; $i++) {
$found = strpos($content, $exactSearch[$i]) !== false;
// Then iterate over keywords, if keyword is not found,
// no need to check for the others. We want all or nothing.
foreach ([$exactSearch, $andSearch] as $search) {
for ($i = 0; $i < count($search) && $found !== false; $i++) {
$found = mb_strpos($content, $search[$i]);
if ($found === false) {
break;
}
// Iterate over keywords, if keyword is not found,
// no need to check for the others. We want all or nothing.
for ($i = 0; $i < count($andSearch) && $found; $i++) {
$found = strpos($content, $andSearch[$i]) !== false;
$foundPositions[] = ['start' => $found, 'end' => $found + mb_strlen($search[$i])];
}
}
// Exclude terms.
for ($i = 0; $i < count($excludeSearch) && $found; $i++) {
for ($i = 0; $i < count($excludeSearch) && $found !== false; $i++) {
$found = strpos($content, $excludeSearch[$i]) === false;
}
if ($found) {
if ($found !== false) {
$link->addAdditionalContentEntry(
'search_highlight',
$this->postProcessFoundPositions($lengths, $foundPositions)
);
$filtered[$id] = $link;
}
}
@ -477,4 +482,74 @@ public static function tagsStrToArray(string $tags, bool $casesensitive): array
return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY);
}
/**
* This method finalize the content of the foundPositions array,
* by associated all search results to their associated bookmark field,
* making sure that there is no overlapping results, etc.
*
* @param array $fieldLengths Start and end positions of every bookmark fields in the aggregated bookmark content.
* @param array $foundPositions Positions where the search results were found in the aggregated content.
*
* @return array Updated $foundPositions, by bookmark field.
*/
protected function postProcessFoundPositions(array $fieldLengths, array $foundPositions): array
{
// Sort results by starting position ASC.
usort($foundPositions, function (array $entryA, array $entryB): int {
return $entryA['start'] > $entryB['start'] ? 1 : -1;
});
$out = [];
$currentMax = -1;
foreach ($foundPositions as $foundPosition) {
// we do not allow overlapping highlights
if ($foundPosition['start'] < $currentMax) {
continue;
}
$currentMax = $foundPosition['end'];
foreach ($fieldLengths as $part => $length) {
if ($foundPosition['start'] < $length['start'] || $foundPosition['start'] > $length['end']) {
continue;
}
$out[$part][] = [
'start' => $foundPosition['start'] - $length['start'],
'end' => $foundPosition['end'] - $length['start'],
];
break;
}
}
return $out;
}
/**
* Concatenate link fields to search across fields. Adds a '\' separator for exact search terms.
* Also populate $length array with starting and ending positions of every bookmark field
* inside concatenated content.
*
* @param Bookmark $link
* @param array $lengths (by reference)
*
* @return string Lowercase concatenated fields content.
*/
protected function buildFullTextSearchableLink(Bookmark $link, array &$lengths): string
{
$content = mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8') .'\\';
$content .= mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8') .'\\';
$content .= mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8') .'\\';
$content .= mb_convert_case($link->getTagsString(), MB_CASE_LOWER, 'UTF-8') .'\\';
$lengths['title'] = ['start' => 0, 'end' => mb_strlen($link->getTitle())];
$nextField = $lengths['title']['end'] + 1;
$lengths['description'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getDescription())];
$nextField = $lengths['description']['end'] + 1;
$lengths['url'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getUrl())];
$nextField = $lengths['url']['end'] + 1;
$lengths['tags'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getTagsString())];
return $content;
}
}

View file

@ -12,10 +12,13 @@
*/
class BookmarkDefaultFormatter extends BookmarkFormatter
{
const SEARCH_HIGHLIGHT_OPEN = '|@@HIGHLIGHT';
const SEARCH_HIGHLIGHT_CLOSE = 'HIGHLIGHT@@|';
/**
* @inheritdoc
*/
public function formatTitle($bookmark)
protected function formatTitle($bookmark)
{
return escape($bookmark->getTitle());
}
@ -23,10 +26,28 @@ public function formatTitle($bookmark)
/**
* @inheritdoc
*/
public function formatDescription($bookmark)
protected function formatTitleHtml($bookmark)
{
$title = $this->tokenizeSearchHighlightField(
$bookmark->getTitle() ?? '',
$bookmark->getAdditionalContentEntry('search_highlight')['title'] ?? []
);
return $this->replaceTokens(escape($title));
}
/**
* @inheritdoc
*/
protected function formatDescription($bookmark)
{
$indexUrl = ! empty($this->contextData['index_url']) ? $this->contextData['index_url'] : '';
return format_description(escape($bookmark->getDescription()), $indexUrl);
$description = $this->tokenizeSearchHighlightField(
$bookmark->getDescription() ?? '',
$bookmark->getAdditionalContentEntry('search_highlight')['description'] ?? []
);
return $this->replaceTokens(format_description(escape($description), $indexUrl));
}
/**
@ -40,7 +61,27 @@ protected function formatTagList($bookmark)
/**
* @inheritdoc
*/
public function formatTagString($bookmark)
protected function formatTagListHtml($bookmark)
{
if (empty($bookmark->getAdditionalContentEntry('search_highlight')['tags'])) {
return $this->formatTagList($bookmark);
}
$tags = $this->tokenizeSearchHighlightField(
$bookmark->getTagsString(),
$bookmark->getAdditionalContentEntry('search_highlight')['tags']
);
$tags = $this->filterTagList(explode(' ', $tags));
$tags = escape($tags);
$tags = $this->replaceTokensArray($tags);
return $tags;
}
/**
* @inheritdoc
*/
protected function formatTagString($bookmark)
{
return implode(' ', $this->formatTagList($bookmark));
}
@ -48,7 +89,7 @@ public function formatTagString($bookmark)
/**
* @inheritdoc
*/
public function formatUrl($bookmark)
protected function formatUrl($bookmark)
{
if ($bookmark->isNote() && isset($this->contextData['index_url'])) {
return rtrim($this->contextData['index_url'], '/') . '/' . escape(ltrim($bookmark->getUrl(), '/'));
@ -77,6 +118,19 @@ protected function formatRealUrl($bookmark)
return escape($bookmark->getUrl());
}
/**
* @inheritdoc
*/
protected function formatUrlHtml($bookmark)
{
$url = $this->tokenizeSearchHighlightField(
$bookmark->getUrl() ?? '',
$bookmark->getAdditionalContentEntry('search_highlight')['url'] ?? []
);
return $this->replaceTokens(escape($url));
}
/**
* @inheritdoc
*/
@ -84,4 +138,72 @@ protected function formatThumbnail($bookmark)
{
return escape($bookmark->getThumbnail());
}
/**
* Insert search highlight token in provided field content based on a list of search result positions
*
* @param string $fieldContent
* @param array|null $positions List of of search results with 'start' and 'end' positions.
*
* @return string Updated $fieldContent.
*/
protected function tokenizeSearchHighlightField(string $fieldContent, ?array $positions): string
{
if (empty($positions)) {
return $fieldContent;
}
$insertedTokens = 0;
$tokenLength = strlen(static::SEARCH_HIGHLIGHT_OPEN);
foreach ($positions as $position) {
$position = [
'start' => $position['start'] + ($insertedTokens * $tokenLength),
'end' => $position['end'] + ($insertedTokens * $tokenLength),
];
$content = mb_substr($fieldContent, 0, $position['start']);
$content .= static::SEARCH_HIGHLIGHT_OPEN;
$content .= mb_substr($fieldContent, $position['start'], $position['end'] - $position['start']);
$content .= static::SEARCH_HIGHLIGHT_CLOSE;
$content .= mb_substr($fieldContent, $position['end']);
$fieldContent = $content;
$insertedTokens += 2;
}
return $fieldContent;
}
/**
* Replace search highlight tokens with HTML highlighted span.
*
* @param string $fieldContent
*
* @return string updated content.
*/
protected function replaceTokens(string $fieldContent): string
{
return str_replace(
[static::SEARCH_HIGHLIGHT_OPEN, static::SEARCH_HIGHLIGHT_CLOSE],
['<span class="search-highlight">', '</span>'],
$fieldContent
);
}
/**
* Apply replaceTokens to an array of content strings.
*
* @param string[] $fieldContents
*
* @return array
*/
protected function replaceTokensArray(array $fieldContents): array
{
foreach ($fieldContents as &$entry) {
$entry = $this->replaceTokens($entry);
}
return $fieldContents;
}
}

View file

@ -2,7 +2,7 @@
namespace Shaarli\Formatter;
use DateTime;
use DateTimeInterface;
use Shaarli\Bookmark\Bookmark;
use Shaarli\Config\ConfigManager;
@ -11,6 +11,29 @@
*
* Abstract class processing all bookmark attributes through methods designed to be overridden.
*
* List of available formatted fields:
* - id ID
* - shorturl Unique identifier, used in permalinks
* - url URL, can be altered in some way, e.g. passing through an HTTP reverse proxy
* - real_url (legacy) same as `url`
* - url_html URL to be displayed in HTML content (it can contain HTML tags)
* - title Title
* - title_html Title to be displayed in HTML content (it can contain HTML tags)
* - description Description content. It most likely contains HTML tags
* - thumbnail Thumbnail: path to local cache file, false if there is none, null if hasn't been retrieved
* - taglist List of tags (array)
* - taglist_urlencoded List of tags (array) URL encoded: it must be used to create a link to a URL containing a tag
* - taglist_html List of tags (array) to be displayed in HTML content (it can contain HTML tags)
* - tags Tags separated by a single whitespace
* - tags_urlencoded Tags separated by a single whitespace, URL encoded: must be used to create a link
* - sticky Is sticky (bool)
* - private Is private (bool)
* - class Additional CSS class
* - created Creation DateTime
* - updated Last edit DateTime
* - timestamp Creation timestamp
* - updated_timestamp Last edit timestamp
*
* @package Shaarli\Formatter
*/
abstract class BookmarkFormatter
@ -55,13 +78,16 @@ public function format($bookmark)
$out['shorturl'] = $this->formatShortUrl($bookmark);
$out['url'] = $this->formatUrl($bookmark);
$out['real_url'] = $this->formatRealUrl($bookmark);
$out['url_html'] = $this->formatUrlHtml($bookmark);
$out['title'] = $this->formatTitle($bookmark);
$out['title_html'] = $this->formatTitleHtml($bookmark);
$out['description'] = $this->formatDescription($bookmark);
$out['thumbnail'] = $this->formatThumbnail($bookmark);
$out['urlencoded_taglist'] = $this->formatUrlEncodedTagList($bookmark);
$out['taglist'] = $this->formatTagList($bookmark);
$out['urlencoded_tags'] = $this->formatUrlEncodedTagString($bookmark);
$out['taglist_urlencoded'] = $this->formatTagListUrlEncoded($bookmark);
$out['taglist_html'] = $this->formatTagListHtml($bookmark);
$out['tags'] = $this->formatTagString($bookmark);
$out['tags_urlencoded'] = $this->formatTagStringUrlEncoded($bookmark);
$out['sticky'] = $bookmark->isSticky();
$out['private'] = $bookmark->isPrivate();
$out['class'] = $this->formatClass($bookmark);
@ -69,6 +95,7 @@ public function format($bookmark)
$out['updated'] = $this->formatUpdated($bookmark);
$out['timestamp'] = $this->formatCreatedTimestamp($bookmark);
$out['updated_timestamp'] = $this->formatUpdatedTimestamp($bookmark);
return $out;
}
@ -135,6 +162,18 @@ protected function formatRealUrl($bookmark)
return $this->formatUrl($bookmark);
}
/**
* Format Url Html: to be displayed in HTML content, it can contains HTML tags.
*
* @param Bookmark $bookmark instance
*
* @return string formatted Url HTML
*/
protected function formatUrlHtml($bookmark)
{
return $this->formatUrl($bookmark);
}
/**
* Format Title
*
@ -147,6 +186,18 @@ protected function formatTitle($bookmark)
return $bookmark->getTitle();
}
/**
* Format Title HTML: to be displayed in HTML content, it can contains HTML tags.
*
* @param Bookmark $bookmark instance
*
* @return string formatted Title
*/
protected function formatTitleHtml($bookmark)
{
return $bookmark->getTitle();
}
/**
* Format Description
*
@ -190,11 +241,23 @@ protected function formatTagList($bookmark)
*
* @return array formatted Tags
*/
protected function formatUrlEncodedTagList($bookmark)
protected function formatTagListUrlEncoded($bookmark)
{
return array_map('urlencode', $this->filterTagList($bookmark->getTags()));
}
/**
* Format Tags HTML: to be displayed in HTML content, it can contains HTML tags.
*
* @param Bookmark $bookmark instance
*
* @return array formatted Tags
*/
protected function formatTagListHtml($bookmark)
{
return $this->formatTagList($bookmark);
}
/**
* Format TagString
*
@ -214,9 +277,9 @@ protected function formatTagString($bookmark)
*
* @return string formatted TagString
*/
protected function formatUrlEncodedTagString($bookmark)
protected function formatTagStringUrlEncoded($bookmark)
{
return implode(' ', $this->formatUrlEncodedTagList($bookmark));
return implode(' ', $this->formatTagListUrlEncoded($bookmark));
}
/**
@ -237,7 +300,7 @@ protected function formatClass($bookmark)
*
* @param Bookmark $bookmark instance
*
* @return DateTime instance
* @return DateTimeInterface instance
*/
protected function formatCreated(Bookmark $bookmark)
{
@ -249,7 +312,7 @@ protected function formatCreated(Bookmark $bookmark)
*
* @param Bookmark $bookmark instance
*
* @return DateTime instance
* @return DateTimeInterface instance
*/
protected function formatUpdated(Bookmark $bookmark)
{

View file

@ -56,7 +56,10 @@ public function formatDescription($bookmark)
return parent::formatDescription($bookmark);
}
$processedDescription = $bookmark->getDescription();
$processedDescription = $this->tokenizeSearchHighlightField(
$bookmark->getDescription() ?? '',
$bookmark->getAdditionalContentEntry('search_highlight')['description'] ?? []
);
$processedDescription = $this->filterProtocols($processedDescription);
$processedDescription = $this->formatHashTags($processedDescription);
$processedDescription = $this->reverseEscapedHtml($processedDescription);
@ -65,6 +68,7 @@ public function formatDescription($bookmark)
->setBreaksEnabled(true)
->text($processedDescription);
$processedDescription = $this->sanitizeHtml($processedDescription);
$processedDescription = $this->replaceTokens($processedDescription);
if (!empty($processedDescription)) {
$processedDescription = '<div class="markdown">'. $processedDescription . '</div>';

View file

@ -671,6 +671,10 @@ body,
content: '';
}
}
.search-highlight {
background-color: yellow;
}
}
.linklist-item-buttons {

View file

@ -165,7 +165,7 @@ <h2>
<i class="fa fa-sticky-note" aria-hidden="true"></i>
{/if}
<span class="linklist-link">{$value.title}</span>
<span class="linklist-link">{$value.title_html}</span>
</a>
</h2>
</div>
@ -183,7 +183,7 @@ <h2>
{$tag_counter=count($value.taglist)}
{loop="value.taglist"}
<span class="label label-tag" title="{$strAddTag}">
<a href="{$base_path}/add-tag/{$value1.urlencoded_taglist.$key2}">{$value}</a>
<a href="{$base_path}/add-tag/{$value1.taglist_urlencoded.$key2}">{$value1.taglist_html.$key2}</a>
</span>
{if="$tag_counter - 1 != $counter"}&middot;{/if}
{/loop}
@ -251,7 +251,7 @@ <h2>
{ignore}do not add space or line break between these div - Firefox issue{/ignore}
class="linklist-item-infos-url pure-u-lg-5-12 pure-u-1">
<a href="{$value.real_url}" aria-label="{$value.title}" title="{$value.title}">
<i class="fa fa-link" aria-hidden="true"></i> {$value.url}
<i class="fa fa-link" aria-hidden="true"></i> {$value.url_html}
</a>
<div class="linklist-item-buttons pure-u-0 pure-u-lg-visible">
<a href="#" aria-label="{$strFold}" title="{$strFold}" class="fold-button"><i class="fa fa-chevron-up" aria-hidden="true"></i></a>