Fix metadata extract regex (2)
Reference: https://stackoverflow.com/questions/8055727/negating-a-backreference-in-regular-expressions
Fixes #1656
This commit is contained in:
parent
e4b8330e45
commit
88a8e284b2
2 changed files with 14 additions and 2 deletions
|
@ -68,11 +68,13 @@ function html_extract_tag($tag, $html)
|
||||||
$properties = implode('|', $propertiesKey);
|
$properties = implode('|', $propertiesKey);
|
||||||
// We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
|
// We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
|
||||||
$orCondition = '["\']?(?:og:)?' . $tag . '["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
|
$orCondition = '["\']?(?:og:)?' . $tag . '["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
|
||||||
|
// Support quotes in double quoted content, and the other way around
|
||||||
|
$content = 'content=(["\'])((?:(?!\1).)*)\1';
|
||||||
// Try to retrieve OpenGraph tag.
|
// Try to retrieve OpenGraph tag.
|
||||||
$ogRegex = '#<meta[^>]+(?:' . $properties . ')=(?:' . $orCondition . ')[^>]*content=(["\'])([^\1]*?)\1.*?>#';
|
$ogRegex = '#<meta[^>]+(?:' . $properties . ')=(?:' . $orCondition . ')[^>]*' . $content . '.*?>#';
|
||||||
// If the attributes are not in the order property => content (e.g. Github)
|
// If the attributes are not in the order property => content (e.g. Github)
|
||||||
// New regex to keep this readable... more or less.
|
// New regex to keep this readable... more or less.
|
||||||
$ogRegexReverse = '#<meta[^>]+content=(["\'])([^\1]*?)\1[^>]+(?:' . $properties . ')=(?:' . $orCondition . ').*?>#';
|
$ogRegexReverse = '#<meta[^>]+' . $content . '[^>]+(?:' . $properties . ')=(?:' . $orCondition . ').*?>#';
|
||||||
|
|
||||||
if (
|
if (
|
||||||
preg_match($ogRegex, $html, $matches) > 0
|
preg_match($ogRegex, $html, $matches) > 0
|
||||||
|
|
|
@ -245,6 +245,16 @@ public function testHtmlExtractNonExistentOgTag()
|
||||||
$this->assertFalse(html_extract_tag('description', $html));
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function testHtmlExtractDescriptionFromGoogleRealCase(): void
|
||||||
|
{
|
||||||
|
$html = 'id="gsr"><meta content="Fêtes de fin d\'année" property="twitter:title"><meta '.
|
||||||
|
'content="Bonnes fêtes de fin d\'année ! #GoogleDoodle" property="twitter:description">'.
|
||||||
|
'<meta content="Bonnes fêtes de fin d\'année ! #GoogleDoodle" property="og:description">'.
|
||||||
|
'<meta content="summary_large_image" property="twitter:card"><meta co'
|
||||||
|
;
|
||||||
|
$this->assertSame('Bonnes fêtes de fin d\'année ! #GoogleDoodle', html_extract_tag('description', $html));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the header callback with valid value
|
* Test the header callback with valid value
|
||||||
*/
|
*/
|
||||||
|
|
Loading…
Reference in a new issue