From 2cd0509b503332b1989f06da45d569d4d2929be5 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 3 Sep 2020 17:46:26 +0200 Subject: [PATCH] Improve regex to extract HTML metadata (title, description, etc.) Also added a bunch of tests to cover more use cases. Fixes #1375 --- application/bookmark/LinkUtils.php | 6 +- tests/bookmark/LinkUtilsTest.php | 89 ++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 68914fca..03e1b82a 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -66,11 +66,13 @@ function html_extract_tag($tag, $html) { $propertiesKey = ['property', 'name', 'itemprop']; $properties = implode('|', $propertiesKey); + // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' + $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; // Try to retrieve OpenGraph image. - $ogRegex = '#]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; + $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; + $ogRegexReverse = '#]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; if (preg_match($ogRegex, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0 diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 7d4a7b89..cc7819bc 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -81,8 +81,78 @@ public function testHtmlExtractNonExistentCharset() public function testHtmlExtractExistentNameTag() { $description = 'Bob and Alice share cookies.'; + + // Simple one line $html = 'stuff2'; $this->assertEquals($description, html_extract_tag('description', $html)); + + // Simple OpenGraph + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // Simple reversed OpenGraph + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // ItemProp OpenGraph + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph without quotes + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed without quotes + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties start + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties both end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties both end with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties start + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties both end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties both end with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // Suggestion from #1375 + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); } /** @@ -92,6 +162,25 @@ public function testHtmlExtractNonExistentNameTag() { $html = 'stuff2'; $this->assertFalse(html_extract_tag('description', $html)); + + // Partial meta tag + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); } /**