From bedd176a5406003631da42366736fd5ebae29135 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Mon, 1 Feb 2016 20:33:58 +0100 Subject: [PATCH 1/2] Improved search: combine AND, exact terms and exclude search. --- application/LinkFilter.php | 60 ++++++++++++++++++++------------- tests/LinkDBTest.php | 1 + tests/LinkFilterTest.php | 40 +++++++++++++++++++--- tests/utils/ReferenceLinkDB.php | 11 +++++- 4 files changed, 83 insertions(+), 29 deletions(-) diff --git a/application/LinkFilter.php b/application/LinkFilter.php index ceb47d16..e2ef94ea 100644 --- a/application/LinkFilter.php +++ b/application/LinkFilter.php @@ -120,7 +120,9 @@ private function filterSmallHash($smallHash) * * Searches: * - in the URLs, title and description; - * - are case-insensitive. + * - are case-insensitive; + * - terms surrounded by quotes " are exact terms search. + * - terms starting with a dash - are excluded (except exact terms). * * Example: * print_r($mydb->filterFulltext('hollandais')); @@ -137,18 +139,28 @@ private function filterSmallHash($smallHash) private function filterFulltext($searchterms, $privateonly = false) { $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8'); - $explodedSearch = explode(' ', trim($search)); - $keys = array('title', 'description', 'url', 'tags'); - $found = true; - $searchExactPhrase = false; + $exactRegex = '/"([^"]+)"/'; + // Retrieve exact search terms. + preg_match_all($exactRegex, $search, $exactSearch); + $exactSearch = array_values(array_filter($exactSearch[1])); - // Check if we're using double-quotes to search for the exact string - if ($search[0] == '"' && $search[strlen($search) - 1] == '"') { - $searchExactPhrase = true; - - // Remove the double-quotes as they are not what we search for - $search = substr($search, 1, -1); + // Remove exact search terms to get AND terms search. + $explodedSearchAnd = explode(' ', trim(preg_replace($exactRegex, '', $search))); + $explodedSearchAnd = array_values(array_filter($explodedSearchAnd)); + + // Filter excluding terms and update andSearch. + $excludeSearch = array(); + $andSearch = array(); + foreach ($explodedSearchAnd as $needle) { + if ($needle[0] == '-' && strlen($needle) > 1) { + $excludeSearch[] = substr($needle, 1); + } else { + $andSearch[] = $needle; + } } + + $keys = array('title', 'description', 'url', 'tags'); + // Iterate over every stored link. foreach ($this->links as $link) { @@ -162,22 +174,22 @@ private function filterFulltext($searchterms, $privateonly = false) // Be optimistic $found = true; - // FIXME: Find a better word for where you're searching in $haystack = mb_convert_case($link[$key], MB_CASE_LOWER, 'UTF-8'); - // When searching for the phrase, check if it's in the haystack... - if ( $searchExactPhrase && strpos($haystack, $search) !== false) { - break; + // First, we look for exact term search + for ($i = 0; $i < count($exactSearch) && $found; $i++) { + $found = strpos($haystack, $exactSearch[$i]) !== false; } - else { - // Iterate over keywords, if keyword is not found, - // no need to check for the others. We want all or nothing. - foreach($explodedSearch as $keyword) { - if(strpos($haystack, $keyword) === false) { - $found = false; - break; - } - } + + // Iterate over keywords, if keyword is not found, + // no need to check for the others. We want all or nothing. + for ($i = 0; $i < count($andSearch) && $found; $i++) { + $found = strpos($haystack, $andSearch[$i]) !== false; + } + + // Exclude terms. + for ($i = 0; $i < count($excludeSearch) && $found; $i++) { + $found = strpos($haystack, $excludeSearch[$i]) === false; } // One of the fields of the link matches, no need to check the other. diff --git a/tests/LinkDBTest.php b/tests/LinkDBTest.php index 765f771e..78f42e56 100644 --- a/tests/LinkDBTest.php +++ b/tests/LinkDBTest.php @@ -278,6 +278,7 @@ public function testAllTags() 'stallman' => 1, 'free' => 1, '-exclude' => 1, + 'stuff' => 2, ), self::$publicLinkDB->allTags() ); diff --git a/tests/LinkFilterTest.php b/tests/LinkFilterTest.php index 164af0d4..4d754d25 100644 --- a/tests/LinkFilterTest.php +++ b/tests/LinkFilterTest.php @@ -27,7 +27,7 @@ public static function setUpBeforeClass() public function testFilter() { $this->assertEquals( - 6, + 7, count(self::$linkFilter->filter('', '')) ); @@ -222,7 +222,7 @@ public function testFilterFullTextDescription() ); $this->assertEquals( - 2, + 3, count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '"free software"')) ); } @@ -250,11 +250,43 @@ public function testFilterFullTextTags() public function testFilterFullTextMixed() { $this->assertEquals( - 2, + 3, count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, 'free software')) ); } + /** + * Full-text search - test exclusion with '-'. + */ + public function testExcludeSearch() + { + $this->assertEquals( + 1, + count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, 'free -software')) + ); + + $this->assertEquals( + 7, + count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '-software')) + ); + } + + /** + * Full-text search - test AND, exact terms and exclusion combined. + */ + public function testMultiSearch() + { + $this->assertEquals( + 2, + count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '"Free Software " stallman "read this"')) + ); + + $this->assertEquals( + 1, + count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '"free software " stallman "read this" -beard')) + ); + } + /** * Tag search with exclusion. */ @@ -266,7 +298,7 @@ public function testTagFilterWithExclusion() ); $this->assertEquals( - 5, + 6, count(self::$linkFilter->filter(LinkFilter::$FILTER_TAG, '-free')) ); } diff --git a/tests/utils/ReferenceLinkDB.php b/tests/utils/ReferenceLinkDB.php index da3e8c65..b64b58bf 100644 --- a/tests/utils/ReferenceLinkDB.php +++ b/tests/utils/ReferenceLinkDB.php @@ -16,12 +16,21 @@ function __construct() $this->addLink( 'Free as in Freedom 2.0', 'https://static.fsf.org/nosvn/faif-2.0.pdf', - 'Richard Stallman and the Free Software Revolution', + 'Richard Stallman and the Free Software Revolution. Read this.', 0, '20150310_114633', 'free gnu software stallman -exclude' ); + $this->addLink( + 'Note:', + 'local', + 'Stallman has a beard and is part of the Free Software Foundation (or not). Seriously, read this.', + 0, + '20150310_114651', + '' + ); + $this->addLink( 'MediaGoblin', 'http://mediagoblin.org/', From 522b278b03280ed809025ebbeb3eac284b68bf81 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Tue, 2 Feb 2016 19:42:48 +0100 Subject: [PATCH 2/2] Support text search across link fields. --- application/LinkFilter.php | 52 ++++++++++++++++----------------- tests/LinkDBTest.php | 1 + tests/LinkFilterTest.php | 51 ++++++++++++++++++++++++++++---- tests/utils/ReferenceLinkDB.php | 8 ++--- 4 files changed, 75 insertions(+), 37 deletions(-) diff --git a/application/LinkFilter.php b/application/LinkFilter.php index e2ef94ea..17594e8f 100644 --- a/application/LinkFilter.php +++ b/application/LinkFilter.php @@ -138,6 +138,7 @@ private function filterSmallHash($smallHash) */ private function filterFulltext($searchterms, $privateonly = false) { + $filtered = array(); $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8'); $exactRegex = '/"([^"]+)"/'; // Retrieve exact search terms. @@ -169,35 +170,32 @@ private function filterFulltext($searchterms, $privateonly = false) continue; } - // Iterate over searchable link fields. + // Concatenate link fields to search across fields. + // Adds a '\' separator for exact search terms. + $content = ''; foreach ($keys as $key) { - // Be optimistic - $found = true; - - $haystack = mb_convert_case($link[$key], MB_CASE_LOWER, 'UTF-8'); - - // First, we look for exact term search - for ($i = 0; $i < count($exactSearch) && $found; $i++) { - $found = strpos($haystack, $exactSearch[$i]) !== false; - } - - // Iterate over keywords, if keyword is not found, - // no need to check for the others. We want all or nothing. - for ($i = 0; $i < count($andSearch) && $found; $i++) { - $found = strpos($haystack, $andSearch[$i]) !== false; - } - - // Exclude terms. - for ($i = 0; $i < count($excludeSearch) && $found; $i++) { - $found = strpos($haystack, $excludeSearch[$i]) === false; - } - - // One of the fields of the link matches, no need to check the other. - if ($found) { - break; - } + $content .= mb_convert_case($link[$key], MB_CASE_LOWER, 'UTF-8') . '\\'; } - + + // Be optimistic + $found = true; + + // First, we look for exact term search + for ($i = 0; $i < count($exactSearch) && $found; $i++) { + $found = strpos($content, $exactSearch[$i]) !== false; + } + + // Iterate over keywords, if keyword is not found, + // no need to check for the others. We want all or nothing. + for ($i = 0; $i < count($andSearch) && $found; $i++) { + $found = strpos($content, $andSearch[$i]) !== false; + } + + // Exclude terms. + for ($i = 0; $i < count($excludeSearch) && $found; $i++) { + $found = strpos($content, $excludeSearch[$i]) === false; + } + if ($found) { $filtered[$link['linkdate']] = $link; } diff --git a/tests/LinkDBTest.php b/tests/LinkDBTest.php index 78f42e56..b6a273b3 100644 --- a/tests/LinkDBTest.php +++ b/tests/LinkDBTest.php @@ -298,6 +298,7 @@ public function testAllTags() 'w3c' => 1, 'css' => 1, 'Mercurial' => 1, + 'stuff' => 2, '-exclude' => 1, '.hidden' => 1, ), diff --git a/tests/LinkFilterTest.php b/tests/LinkFilterTest.php index 4d754d25..31fd4cf4 100644 --- a/tests/LinkFilterTest.php +++ b/tests/LinkFilterTest.php @@ -164,6 +164,17 @@ public function testFilterUnknownSmallHash() ); } + /** + * Full-text search - no result found. + */ + public function testFilterFullTextNoResult() + { + $this->assertEquals( + 0, + count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, 'azertyuiop')) + ); + } + /** * Full-text search - result from a link's URL */ @@ -262,28 +273,56 @@ public function testExcludeSearch() { $this->assertEquals( 1, - count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, 'free -software')) + count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, 'free -gnu')) ); $this->assertEquals( - 7, - count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '-software')) + 6, + count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '-revolution')) ); } /** - * Full-text search - test AND, exact terms and exclusion combined. + * Full-text search - test AND, exact terms and exclusion combined, across fields. */ public function testMultiSearch() { $this->assertEquals( 2, - count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '"Free Software " stallman "read this"')) + count(self::$linkFilter->filter( + LinkFilter::$FILTER_TEXT, + '"Free Software " stallman "read this" @website stuff' + )) ); $this->assertEquals( 1, - count(self::$linkFilter->filter(LinkFilter::$FILTER_TEXT, '"free software " stallman "read this" -beard')) + count(self::$linkFilter->filter( + LinkFilter::$FILTER_TEXT, + '"free software " stallman "read this" -beard @website stuff' + )) + ); + } + + /** + * Full-text search - make sure that exact search won't work across fields. + */ + public function testSearchExactTermMultiFieldsKo() + { + $this->assertEquals( + 0, + count(self::$linkFilter->filter( + LinkFilter::$FILTER_TEXT, + '"designer naming"' + )) + ); + + $this->assertEquals( + 0, + count(self::$linkFilter->filter( + LinkFilter::$FILTER_TEXT, + '"designernaming"' + )) ); } diff --git a/tests/utils/ReferenceLinkDB.php b/tests/utils/ReferenceLinkDB.php index b64b58bf..61faef05 100644 --- a/tests/utils/ReferenceLinkDB.php +++ b/tests/utils/ReferenceLinkDB.php @@ -14,21 +14,21 @@ class ReferenceLinkDB function __construct() { $this->addLink( - 'Free as in Freedom 2.0', + 'Free as in Freedom 2.0 @website', 'https://static.fsf.org/nosvn/faif-2.0.pdf', 'Richard Stallman and the Free Software Revolution. Read this.', 0, '20150310_114633', - 'free gnu software stallman -exclude' + 'free gnu software stallman -exclude stuff' ); $this->addLink( - 'Note:', + 'Link title: @website', 'local', 'Stallman has a beard and is part of the Free Software Foundation (or not). Seriously, read this.', 0, '20150310_114651', - '' + 'stuff' ); $this->addLink(