From 00d3dd91ef42df13eeafbcc54dcebe3238e322c6 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Sun, 8 Nov 2020 13:54:39 +0100 Subject: [PATCH] Fix an issue truncating extracted metadata content Previous regex forced the selection to stop at either the first single or double quote found, regardless of the opening quote. Using '\1', we're sure to wait for the proper quote before stopping the capture. --- application/bookmark/LinkUtils.php | 8 ++++---- tests/bookmark/LinkUtilsTest.php | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 17c37979..a74fda57 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -68,16 +68,16 @@ function html_extract_tag($tag, $html) $properties = implode('|', $propertiesKey); // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; - // Try to retrieve OpenGraph image. - $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; + // Try to retrieve OpenGraph tag. + $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=(["\'])([^\1]*?)\1.*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; + $ogRegexReverse = '#]+content=(["\'])([^\1]*?)\1[^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; if (preg_match($ogRegex, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0 ) { - return $matches[1]; + return $matches[2]; } return false; diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 3321242f..9bddf84b 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -168,6 +168,36 @@ public function testHtmlExtractExistentNameTag() $this->assertEquals($description, html_extract_tag('description', $html)); } + /** + * Test html_extract_tag() with double quoted content containing single quote, and the opposite. + */ + public function testHtmlExtractExistentNameTagWithMixedQuotes(): void + { + $description = 'Bob and Alice share M&M\'s.'; + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $description = 'Bob and Alice share "cookies".'; + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + } + /** * Test html_extract_tag() when the tag