MyShaarli/application/LinkUtils.php

<?php

/**
 * Extract the title from an HTML document.
 *
 * The content of the first <title> element is returned with every line
 * break character (LF and CR) removed and surrounding whitespace trimmed.
 *
 * @param string $html HTML content where to look for a title.
 *
 * @return bool|string Extracted title if found, false otherwise.
 */
function html_extract_title($html)
{
    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
        // Strip CR as well as LF: documents using CRLF line endings would
        // otherwise keep a stray "\r" in the middle of the title.
        return trim(str_replace(array("\r", "\n"), '', $matches[1]));
    }
    return false;
}

/**
 * Work out which charset applies to a downloaded page.
 *
 * Sources are consulted in this order and the first usable one wins:
 *   1. the HTTP headers (Content-Type),
 *   2. the HTML document itself (<meta charset> tag),
 *   3. the supplied default (UTF-8 unless overridden).
 *
 * @param array  $headers           HTTP headers array.
 * @param string $htmlContent       HTML content where to look for charset.
 * @param string $defaultCharset    Default charset to apply if other methods failed.
 *
 * @return string Determined charset.
 */
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
{
    $fromHeaders = headers_extract_charset($headers);
    if ($fromHeaders) {
        return $fromHeaders;
    }

    // The HTML is only inspected when the headers gave nothing usable.
    $fromMarkup = html_extract_charset($htmlContent);

    return $fromMarkup ? $fromMarkup : $defaultCharset;
}

/**
 * Extract charset from HTTP headers if it's defined.
 *
 * The Content-Type header is looked up case-insensitively, and the charset
 * parameter may be quoted or unquoted (e.g. `charset=UTF-8`, `charset="utf-8"`).
 * When the header value is a list (repeated header), the last entry is used.
 *
 * @param array $headers HTTP headers array.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function headers_extract_charset($headers)
{
    if (! is_array($headers)) {
        return false;
    }

    // HTTP header names are case-insensitive: normalise the keys before the lookup.
    $normalized = array_change_key_case($headers, CASE_LOWER);
    if (empty($normalized['content-type'])) {
        return false;
    }

    $contentType = $normalized['content-type'];
    // A repeated header can be delivered as a list of values
    // (e.g. get_headers() in associative mode); keep the last one.
    if (is_array($contentType)) {
        $contentType = end($contentType);
    }
    if (! is_string($contentType)) {
        return false;
    }

    // Quotes, semicolons and whitespace are excluded from the capture so that
    // `charset="utf-8"` yields `utf-8` rather than `utf-8"`.
    if (preg_match('/\bcharset\s*=\s*["\']?([^"\';\s]+)/i', $contentType, $match)) {
        return strtolower($match[1]);
    }

    return false;
}

/**
 * Extract charset from HTML content (<meta> tag).
 *
 * Both forms are recognised:
 *   - <meta charset="utf-8">
 *   - <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 *
 * @param string $html HTML content where to look for charset.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function html_extract_charset($html)
{
    // `[^>]*?` keeps the search inside one <meta> tag, so a `charset=` found
    // later in the page (e.g. in a link's query string) is never picked up.
    // The tag is not required to close right after the value, which allows
    // additional attributes to follow the charset.
    $pattern = '#<meta\s[^>]*?charset\s*=\s*["\']?([^"\';\s>/]+)#i';
    if (preg_match($pattern, $html, $enc)) {
        return strtolower($enc[1]);
    }

    return false;
}

/**
 * Tally the private entries of a link list.
 *
 * Every element is expected to expose a 'private' entry; an element is
 * counted when that entry loosely equals true.
 *
 * @param array $links Linklist.
 *
 * @return int Number of private links.
 */
function count_private($links)
{
    $privateCount = 0;

    foreach ($links as $entry) {
        if ($entry['private'] == true) {
            $privateCount++;
        }
    }

    return $privateCount;
}
Fixes #410 - Retrieve title fails in multiple cases * `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary. 2016-01-04 10:45:54 +01:00			`<?php`

			`/**`
			`* Extract title from an HTML document.`
			`*`
			`* @param string $html HTML content where to look for a title.`
			`*`
			`* @return bool\|string Extracted title if found, false otherwise.`
			`*/`
			`function html_extract_title($html)`
			`{`
Fixes #531 - Title retrieving is failing with multiple use case see https://github.com/shaarli/Shaarli/issues/531 for details 2016-04-06 22:00:52 +02:00			`if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {`
			`return trim(str_replace("\n", '', $matches[1]));`
Fixes #410 - Retrieve title fails in multiple cases * `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary. 2016-01-04 10:45:54 +01:00			`}`
			`return false;`
			`}`

			`/**`
			`* Determine charset from downloaded page.`
			`* Priority:`
			`* 1. HTTP headers (Content type).`
			`* 2. HTML content page (tag <meta charset>).`
			`* 3. Use a default charset (default: UTF-8).`
			`*`
			`* @param array $headers HTTP headers array.`
			`* @param string $htmlContent HTML content where to look for charset.`
			`* @param string $defaultCharset Default charset to apply if other methods failed.`
			`*`
			`* @return string Determined charset.`
			`*/`
			`function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')`
			`{`
			`if ($charset = headers_extract_charset($headers)) {`
			`return $charset;`
			`}`

			`if ($charset = html_extract_charset($htmlContent)) {`
			`return $charset;`
			`}`

			`return $defaultCharset;`
			`}`

			`/**`
			`* Extract charset from HTTP headers if it's defined.`
			`*`
			`* @param array $headers HTTP headers array.`
			`*`
			`* @return bool\|string Charset string if found (lowercase), false otherwise.`
			`*/`
			`function headers_extract_charset($headers)`
			`{`
			`if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {`
			`preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);`
			`if (! empty($match[1])) {`
			`return strtolower(trim($match[1]));`
			`}`
			`}`

			`return false;`
			`}`

			`/**`
			`* Extract charset HTML content (tag <meta charset>).`
			`*`
			`* @param string $html HTML content where to look for charset.`
			`*`
			`* @return bool\|string Charset string if found, false otherwise.`
			`*/`
			`function html_extract_charset($html)`
			`{`
			`// Get encoding specified in HTML header.`
Fixes #531 - Title retrieving is failing with multiple use case see https://github.com/shaarli/Shaarli/issues/531 for details 2016-04-06 22:00:52 +02:00			`preg_match('#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi', $html, $enc);`
Fixes #410 - Retrieve title fails in multiple cases * `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary. 2016-01-04 10:45:54 +01:00			`if (!empty($enc[1])) {`
			`return strtolower($enc[1]);`
			`}`

			`return false;`
			`}`
Add private link counter 2016-05-11 00:05:22 +02:00
			`/**`
			`* Count private links in given linklist.`
			`*`
			`* @param array $links Linklist.`
			`*`
			`* @return int Number of private links.`
			`*/`
			`function count_private($links)`
			`{`
			`$cpt = 0;`
			`foreach ($links as $link) {`
			`$cpt = $link['private'] == true ? $cpt + 1 : $cpt;`
			`}`
			`return $cpt;`
			`}`