2016-01-04 10:45:54 +01:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * Extract title from an HTML document.
 *
 * The content of the first <title> element is returned with every line break
 * (CR and LF) removed and surrounding whitespace trimmed. An empty <title>
 * element yields an empty string, not false.
 *
 * @param string $html HTML content where to look for a title.
 *
 * @return bool|string Extracted title if found, false otherwise.
 */
function html_extract_title($html)
{
    // "\b" prevents matching tags that merely start with "title" (e.g. <titlebar>),
    // and "[^>]*" keeps the opening-tag match inside a single tag.
    if (preg_match('!<title\b[^>]*>(.*?)</title>!is', $html, $matches)) {
        // Strip both CR and LF so titles from CRLF documents do not keep stray "\r".
        return trim(str_replace(array("\r", "\n"), '', $matches[1]));
    }

    return false;
}
|
|
|
|
|
|
|
|
/**
 * Work out which charset applies to a downloaded page.
 *
 * Sources are consulted in this order, and the first one yielding a
 * non-empty value wins:
 *   1. HTTP headers (Content type).
 *   2. HTML content page (tag <meta charset>).
 *   3. The supplied default charset (default: UTF-8).
 *
 * @param array  $headers        HTTP headers array.
 * @param string $htmlContent    HTML content where to look for charset.
 * @param string $defaultCharset Default charset to apply if other methods failed.
 *
 * @return string Determined charset.
 */
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
{
    // The HTTP headers take precedence over anything declared in the markup.
    $fromHeaders = headers_extract_charset($headers);
    if ($fromHeaders) {
        return $fromHeaders;
    }

    // The markup is only inspected when the headers gave nothing usable.
    $fromMarkup = html_extract_charset($htmlContent);

    return $fromMarkup ? $fromMarkup : $defaultCharset;
}
|
|
|
|
|
|
|
|
/**
 * Extract charset from HTTP headers if it's defined.
 *
 * Only the 'Content-Type' entry is inspected. The "charset" parameter name is
 * matched case-insensitively and the value may be unquoted or wrapped in
 * single/double quotes; the quotes are never part of the returned value.
 *
 * @param array $headers HTTP headers array. The 'Content-Type' entry may be a
 *                       string, or an array of strings when the header was
 *                       received several times; the last value is used then.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function headers_extract_charset($headers)
{
    if (empty($headers['Content-Type'])) {
        return false;
    }

    $contentType = $headers['Content-Type'];

    // A repeated header (e.g. one per response in a redirect chain) is exposed
    // as an array: use the last value instead of feeding an array to PCRE.
    if (is_array($contentType)) {
        $contentType = end($contentType);
    }
    if (! is_string($contentType)) {
        return false;
    }

    // Quotes and whitespace are excluded from the capture so that
    // charset="utf-8" yields utf-8 rather than utf-8".
    if (preg_match('/charset=["\']?([^;"\'\s]+)/i', $contentType, $match)) {
        return strtolower($match[1]);
    }

    return false;
}
|
|
|
|
|
|
|
|
/**
 * Extract charset HTML content (tag <meta charset>).
 *
 * Both <meta charset="..."> and the http-equiv form
 * <meta http-equiv="Content-Type" content="text/html; charset=..."> are
 * recognised. The search never leaves the <meta> tag it started in, so a
 * charset attribute on another element (e.g. <script charset="...">) is
 * ignored, and attributes following the charset do not prevent a match.
 *
 * @param string $html HTML content where to look for charset.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function html_extract_charset($html)
{
    // "[^>]*?" confines the lookup to a single <meta ...> tag; the value stops
    // at the first whitespace, quote, ';', '/' or '>'.
    $pattern = '#<meta\s[^>]*?charset\s*=\s*["\']?\s*([^\s"\';/>]+)#si';

    if (preg_match($pattern, $html, $enc)) {
        return strtolower($enc[1]);
    }

    return false;
}
|