Fix an issue truncating extracted metadata content

The previous regex stopped the capture at the first single or double quote it found, regardless of which quote character opened the attribute value. By capturing the opening quote and referencing it with '\1', the capture now only stops at the matching closing quote.
This commit is contained in:
ArthurHoaro 2020-11-08 13:54:39 +01:00
parent 8c5f6c786d
commit 00d3dd91ef
2 changed files with 34 additions and 4 deletions

View file

@ -68,16 +68,16 @@ function html_extract_tag($tag, $html)
$properties = implode('|', $propertiesKey); $properties = implode('|', $propertiesKey);
// We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
$orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
// Try to retrieve OpenGraph image. // Try to retrieve OpenGraph tag.
$ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=(["\'])([^\1]*?)\1.*?>#';
// If the attributes are not in the order property => content (e.g. Github) // If the attributes are not in the order property => content (e.g. Github)
// New regex to keep this readable... more or less. // New regex to keep this readable... more or less.
$ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; $ogRegexReverse = '#<meta[^>]+content=(["\'])([^\1]*?)\1[^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
if (preg_match($ogRegex, $html, $matches) > 0 if (preg_match($ogRegex, $html, $matches) > 0
|| preg_match($ogRegexReverse, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0
) { ) {
return $matches[1]; return $matches[2];
} }
return false; return false;

View file

@ -168,6 +168,36 @@ public function testHtmlExtractExistentNameTag()
$this->assertEquals($description, html_extract_tag('description', $html)); $this->assertEquals($description, html_extract_tag('description', $html));
} }
/**
 * Test html_extract_tag() when the content attribute is delimited by one kind of quote
 * and its value contains the other kind: double quoted content holding a single quote,
 * then single quoted content holding double quotes.
 */
public function testHtmlExtractExistentNameTagWithMixedQuotes(): void
{
    // Content wrapped in double quotes, with a single quote inside the value.
    $expected = 'Bob and Alice share M&M\'s.';
    $documents = [
        // Only the property and content attributes.
        '<meta property="og:description" content="' . $expected . '">',
        // Several properties in one attribute, with unrelated attributes around content.
        '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
        'tag2="content2" content="' . $expected . '" tag3="content3">',
        // Both property and name attributes present.
        '<meta property="og:description" name="description" content="' . $expected . '">',
    ];
    foreach ($documents as $document) {
        $this->assertEquals($expected, html_extract_tag('description', $document));
    }

    // Content wrapped in single quotes, with double quotes inside the value.
    $expected = 'Bob and Alice share "cookies".';
    $documents = [
        // Only the property and content attributes.
        '<meta property="og:description" content=\'' . $expected . '\'>',
        // Several properties in one attribute, with unrelated attributes around content.
        '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
        'tag2="content2" content=\'' . $expected . '\' tag3="content3">',
        // Both property and name attributes present.
        '<meta property="og:description" name="description" content=\'' . $expected . '\'>',
    ];
    foreach ($documents as $document) {
        $this->assertEquals($expected, html_extract_tag('description', $document));
    }
}
/** /**
* Test html_extract_tag() when the tag <meta name= is not found. * Test html_extract_tag() when the tag <meta name= is not found.
*/ */