Improve regex to extract HTML metadata (title, description, etc.)
Also added a bunch of tests to cover more use cases. Fixes #1375
This commit is contained in:
parent
21163a3329
commit
2cd0509b50
2 changed files with 93 additions and 2 deletions
|
@ -66,11 +66,13 @@ function html_extract_tag($tag, $html)
|
||||||
{
|
{
|
||||||
$propertiesKey = ['property', 'name', 'itemprop'];
|
$propertiesKey = ['property', 'name', 'itemprop'];
|
||||||
$properties = implode('|', $propertiesKey);
|
$properties = implode('|', $propertiesKey);
|
||||||
|
// We need an OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
|
||||||
|
$orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
|
||||||
// Try to retrieve OpenGraph image.
|
// Try to retrieve OpenGraph image.
|
||||||
$ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
|
$ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
|
||||||
// If the attributes are not in the order property => content (e.g. Github)
|
// If the attributes are not in the order property => content (e.g. Github)
|
||||||
// New regex to keep this readable... more or less.
|
// New regex to keep this readable... more or less.
|
||||||
$ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
|
$ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
|
||||||
|
|
||||||
if (preg_match($ogRegex, $html, $matches) > 0
|
if (preg_match($ogRegex, $html, $matches) > 0
|
||||||
|| preg_match($ogRegexReverse, $html, $matches) > 0
|
|| preg_match($ogRegexReverse, $html, $matches) > 0
|
||||||
|
|
|
@ -81,8 +81,78 @@ public function testHtmlExtractNonExistentCharset()
|
||||||
public function testHtmlExtractExistentNameTag()
|
public function testHtmlExtractExistentNameTag()
|
||||||
{
|
{
|
||||||
$description = 'Bob and Alice share cookies.';
|
$description = 'Bob and Alice share cookies.';
|
||||||
|
|
||||||
|
// Simple one line
|
||||||
$html = '<html><meta>stuff2</meta><meta name="description" content="' . $description . '"/></html>';
|
$html = '<html><meta>stuff2</meta><meta name="description" content="' . $description . '"/></html>';
|
||||||
$this->assertEquals($description, html_extract_tag('description', $html));
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// Simple OpenGraph
|
||||||
|
$html = '<meta property="og:description" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// Simple reversed OpenGraph
|
||||||
|
$html = '<meta content="' . $description . '" property="og:description">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// ItemProp OpenGraph
|
||||||
|
$html = '<meta itemprop="og:description" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph without quotes
|
||||||
|
$html = '<meta property=og:description content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph reversed without quotes
|
||||||
|
$html = '<meta content="' . $description . '" property=og:description>';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph with noise
|
||||||
|
$html = '<meta tag1="content1" property="og:description" tag2="content2" content="' .
|
||||||
|
$description . '" tag3="content3">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph reversed with noise
|
||||||
|
$html = '<meta tag1="content1" content="' . $description . '" ' .
|
||||||
|
'tag3="content3" tag2="content2" property="og:description">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph multiple properties start
|
||||||
|
$html = '<meta property="unrelated og:description" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph multiple properties end
|
||||||
|
$html = '<meta property="og:description unrelated" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph multiple properties both ends
|
||||||
|
$html = '<meta property="og:unrelated1 og:description og:unrelated2" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph multiple properties both ends with noise
|
||||||
|
$html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
|
||||||
|
'tag2="content2" content="' . $description . '" tag3="content3">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph reversed multiple properties start
|
||||||
|
$html = '<meta content="' . $description . '" property="unrelated og:description">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph reversed multiple properties end
|
||||||
|
$html = '<meta content="' . $description . '" property="og:description unrelated">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph reversed multiple properties both ends
|
||||||
|
$html = '<meta content="' . $description . '" property="og:unrelated1 og:description og:unrelated2">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// OpenGraph reversed multiple properties both ends with noise
|
||||||
|
$html = '<meta tag1="content1" content="' . $description . '" tag2="content2" '.
|
||||||
|
'property="og:unrelated1 og:description og:unrelated2" tag3="content3">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// Suggestion from #1375
|
||||||
|
$html = '<meta property="og:description" name="description" content="' . $description . '">';
|
||||||
|
$this->assertEquals($description, html_extract_tag('description', $html));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -92,6 +162,25 @@ public function testHtmlExtractNonExistentNameTag()
|
||||||
{
|
{
|
||||||
$html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>';
|
$html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>';
|
||||||
$this->assertFalse(html_extract_tag('description', $html));
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
// Partial meta tag
|
||||||
|
$html = '<meta content="Brief description">';
|
||||||
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta property="og:description">';
|
||||||
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta tag1="content1" property="og:description">';
|
||||||
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta property="og:description" tag1="content1">';
|
||||||
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta tag1="content1" content="Brief description">';
|
||||||
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
|
|
||||||
|
$html = '<meta content="Brief description" tag1="content1">';
|
||||||
|
$this->assertFalse(html_extract_tag('description', $html));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in a new issue