Merge pull request #1631 from ArthurHoaro/fix/html-extract-quote-fix
Fix an issue truncating extracted metadata content
This commit is contained in:
commit
8d8fa898ab
2 changed files with 34 additions and 4 deletions
|
@ -68,16 +68,16 @@ function html_extract_tag($tag, $html)
|
||||||
$properties = implode('|', $propertiesKey);
|
$properties = implode('|', $propertiesKey);
|
||||||
// We need an OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
|
// We need an OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
|
||||||
$orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
|
$orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
|
||||||
// Try to retrieve OpenGraph image.
|
// Try to retrieve OpenGraph tag.
|
||||||
$ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
|
$ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=(["\'])([^\1]*?)\1.*?>#';
|
||||||
// If the attributes are not in the order property => content (e.g. Github)
|
// If the attributes are not in the order property => content (e.g. Github)
|
||||||
// New regex to keep this readable... more or less.
|
// New regex to keep this readable... more or less.
|
||||||
$ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
|
$ogRegexReverse = '#<meta[^>]+content=(["\'])([^\1]*?)\1[^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
|
||||||
|
|
||||||
if (preg_match($ogRegex, $html, $matches) > 0
|
if (preg_match($ogRegex, $html, $matches) > 0
|
||||||
|| preg_match($ogRegexReverse, $html, $matches) > 0
|
|| preg_match($ogRegexReverse, $html, $matches) > 0
|
||||||
) {
|
) {
|
||||||
return $matches[1];
|
return $matches[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -168,6 +168,36 @@ public function testHtmlExtractExistentNameTag()
|
||||||
$this->assertEquals($description, html_extract_tag('description', $html));
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test html_extract_tag() with double-quoted content containing a single quote, and vice versa.
|
||||||
|
*/
|
||||||
|
public function testHtmlExtractExistentNameTagWithMixedQuotes(): void
|
||||||
|
{
|
||||||
|
$description = 'Bob and Alice share M&M\'s.';
|
||||||
|
|
||||||
|
$html = '<meta property="og:description" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
|
||||||
|
'tag2="content2" content="' . $description . '" tag3="content3">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta property="og:description" name="description" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$description = 'Bob and Alice share "cookies".';
|
||||||
|
|
||||||
|
$html = '<meta property="og:description" content=\'' . $description . '\'>';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
|
||||||
|
'tag2="content2" content=\'' . $description . '\' tag3="content3">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta property="og:description" name="description" content=\'' . $description . '\'>';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test html_extract_tag() when the tag <meta name= is not found.
|
* Test html_extract_tag() when the tag <meta name= is not found.
|
||||||
*/
|
*/
|
||||||
|
|
Loading…
Reference in a new issue