MyShaarli/application/LinkUtils.php

<?php

/**
 * Extract title from an HTML document.
 *
 * @param string $html HTML content where to look for a title.
 *
 * @return bool|string Extracted title if found, false otherwise.
 */
function html_extract_title($html)
{
    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
        return trim(str_replace("\n", '', $matches[1]));
    }
    return false;
}

/**
 * Determine charset from downloaded page.
 * Priority:
 *   1. HTTP headers (Content type).
 *   2. HTML content page (tag <meta charset>).
 *   3. Use a default charset (default: UTF-8).
 *
 * @param array  $headers           HTTP headers array.
 * @param string $htmlContent       HTML content where to look for charset.
 * @param string $defaultCharset    Default charset to apply if other methods failed.
 *
 * @return string Determined charset.
 */
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
{
    if ($charset = headers_extract_charset($headers)) {
        return $charset;
    }

    if ($charset = html_extract_charset($htmlContent)) {
        return $charset;
    }

    return $defaultCharset;
}

/**
 * Extract charset from HTTP headers if it's defined.
 *
 * @param array $headers HTTP headers array.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function headers_extract_charset($headers)
{
    if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
        preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
        if (! empty($match[1])) {
            return strtolower(trim($match[1]));
        }
    }

    return false;
}

/**
 * Extract charset HTML content (tag <meta charset>).
 *
 * @param string $html HTML content where to look for charset.
 *
 * @return bool|string Charset string if found, false otherwise.
 */
function html_extract_charset($html)
{
    // Get encoding specified in HTML header.
    preg_match('#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi', $html, $enc);
    if (!empty($enc[1])) {
        return strtolower($enc[1]);
    }

    return false;
}

/**
 * Count private links in given linklist.
 *
 * @param array $links Linklist.
 *
 * @return int Number of private links.
 */
function count_private($links)
{
    $cpt = 0;
    foreach ($links as $link) {
        $cpt = $link['private'] == true ? $cpt + 1 : $cpt;
    }

    return $cpt;
}

/**
 * In a string, converts URLs to clickable links.
 *
 * @param string $text       input string.
 * @param string $redirector if a redirector is set, use it to gerenate links.
 *
 * @return string returns $text with all links converted to HTML links.
 *
 * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722
 */
function text2clickable($text, $redirector = '')
{
    $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[[:alnum:]]/?)!si';

    if (empty($redirector)) {
        return preg_replace($regex, '<a href="$1">$1</a>', $text);
    }
    // Redirector is set, urlencode the final URL.
    return preg_replace_callback(
        $regex,
        function ($matches) use ($redirector) {
            return '<a href="' . $redirector . urlencode($matches[1]) .'">'. $matches[1] .'</a>';
        },
        $text
    );
}

/**
 * Auto-link hashtags.
 *
 * @param string $description Given description.
 * @param string $indexUrl    Root URL.
 *
 * @return string Description with auto-linked hashtags.
 */
function hashtag_autolink($description, $indexUrl = '')
{
    /*
     * To support unicode: http://stackoverflow.com/a/35498078/1484919
     * \p{Pc} - to match underscore
     * \p{N} - numeric character in any script
     * \p{L} - letter from any language
     * \p{Mn} - any non marking space (accents, umlauts, etc)
     */
    $regex = '/(^|\s)#([\p{Pc}\p{N}\p{L}\p{Mn}]+)/mui';
    $replacement = '$1<a href="'. $indexUrl .'?addtag=$2" title="Hashtag $2">#$2</a>';
    return preg_replace($regex, $replacement, $description);
}

/**
 * This function inserts &nbsp; where relevant so that multiple spaces are properly displayed in HTML
 * even in the absence of <pre>  (This is used in description to keep text formatting).
 *
 * @param string $text input text.
 *
 * @return string formatted text.
 */
function space2nbsp($text)
{
    return preg_replace('/(^| ) /m', '$1&nbsp;', $text);
}

/**
 * Format Shaarli's description
 *
 * @param string $description shaare's description.
 * @param string $redirector  if a redirector is set, use it to gerenate links.
 *
 * @return string formatted description.
 */
function format_description($description, $redirector = '', $indexUrl = '') {
    return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector), $indexUrl)));
}
Fixes #410 - Retrieve title fails in multiple cases * `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary. 2016-01-04 10:45:54 +01:00			`<?php`

			`/**`
			`* Extract title from an HTML document.`
			`*`
			`* @param string $html HTML content where to look for a title.`
			`*`
			`* @return bool\|string Extracted title if found, false otherwise.`
			`*/`
			`function html_extract_title($html)`
			`{`
Fixes #531 - Title retrieving is failing with multiple use case see https://github.com/shaarli/Shaarli/issues/531 for details 2016-04-06 22:00:52 +02:00			`if (preg_match('!<title.?>(.?)</title>!is', $html, $matches)) {`
			`return trim(str_replace("\n", '', $matches[1]));`
Fixes #410 - Retrieve title fails in multiple cases * `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary. 2016-01-04 10:45:54 +01:00			`}`
			`return false;`
			`}`

			`/**`
			`* Determine charset from downloaded page.`
			`* Priority:`
			`* 1. HTTP headers (Content type).`
			`* 2. HTML content page (tag <meta charset>).`
			`* 3. Use a default charset (default: UTF-8).`
			`*`
			`* @param array $headers HTTP headers array.`
			`* @param string $htmlContent HTML content where to look for charset.`
			`* @param string $defaultCharset Default charset to apply if other methods failed.`
			`*`
			`* @return string Determined charset.`
			`*/`
			`function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')`
			`{`
			`if ($charset = headers_extract_charset($headers)) {`
			`return $charset;`
			`}`

			`if ($charset = html_extract_charset($htmlContent)) {`
			`return $charset;`
			`}`

			`return $defaultCharset;`
			`}`

			`/**`
			`* Extract charset from HTTP headers if it's defined.`
			`*`
			`* @param array $headers HTTP headers array.`
			`*`
			`* @return bool\|string Charset string if found (lowercase), false otherwise.`
			`*/`
			`function headers_extract_charset($headers)`
			`{`
			`if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {`
			`preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);`
			`if (! empty($match[1])) {`
			`return strtolower(trim($match[1]));`
			`}`
			`}`

			`return false;`
			`}`

			`/**`
			`* Extract charset HTML content (tag <meta charset>).`
			`*`
			`* @param string $html HTML content where to look for charset.`
			`*`
			`* @return bool\|string Charset string if found, false otherwise.`
			`*/`
			`function html_extract_charset($html)`
			`{`
			`// Get encoding specified in HTML header.`
Fixes #531 - Title retrieving is failing with multiple use case see https://github.com/shaarli/Shaarli/issues/531 for details 2016-04-06 22:00:52 +02:00			`preg_match('#<meta .charset=["\']?([^";\'>/]+)["\']? /?>#Usi', $html, $enc);`
Fixes #410 - Retrieve title fails in multiple cases * `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary. 2016-01-04 10:45:54 +01:00			`if (!empty($enc[1])) {`
			`return strtolower($enc[1]);`
			`}`

			`return false;`
			`}`
Add private link counter 2016-05-11 00:05:22 +02:00
			`/**`
			`* Count private links in given linklist.`
			`*`
			`* @param array $links Linklist.`
			`*`
			`* @return int Number of private links.`
			`*/`
			`function count_private($links)`
			`{`
			`$cpt = 0;`
			`foreach ($links as $link) {`
			`$cpt = $link['private'] == true ? $cpt + 1 : $cpt;`
			`}`
Hashtag system * Hashtag are auto-linked with a filter search * Supports unicode * Compatible with markdown (excluded in code blocks) 2016-05-10 23:18:04 +02:00
Add private link counter 2016-05-11 00:05:22 +02:00			`return $cpt;`
			`}`
Hashtag system * Hashtag are auto-linked with a filter search * Supports unicode * Compatible with markdown (excluded in code blocks) 2016-05-10 23:18:04 +02:00
			`/**`
			`* In a string, converts URLs to clickable links.`
			`*`
			`* @param string $text input string.`
			`* @param string $redirector if a redirector is set, use it to gerenate links.`
			`*`
			`* @return string returns $text with all links converted to HTML links.`
			`*`
			`* @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722`
			`*/`
			`function text2clickable($text, $redirector = '')`
			`{`
			`$regex = '!(((?:https?\|ftp\|file)://\|apt:\|magnet:)\S+[[:alnum:]]/?)!si';`

			`if (empty($redirector)) {`
			`return preg_replace($regex, '<a href="$1">$1</a>', $text);`
			`}`
			`// Redirector is set, urlencode the final URL.`
			`return preg_replace_callback(`
			`$regex,`
			`function ($matches) use ($redirector) {`
			`return '<a href="' . $redirector . urlencode($matches[1]) .'">'. $matches[1] .'</a>';`
			`},`
			`$text`
			`);`
			`}`

			`/**`
			`* Auto-link hashtags.`
			`*`
			`* @param string $description Given description.`
			`* @param string $indexUrl Root URL.`
			`*`
			`* @return string Description with auto-linked hashtags.`
			`*/`
			`function hashtag_autolink($description, $indexUrl = '')`
			`{`
			`/*`
			`* To support unicode: http://stackoverflow.com/a/35498078/1484919`
			`* \p{Pc} - to match underscore`
			`* \p{N} - numeric character in any script`
			`* \p{L} - letter from any language`
			`* \p{Mn} - any non marking space (accents, umlauts, etc)`
			`*/`
			`$regex = '/(^\|\s)#([\p{Pc}\p{N}\p{L}\p{Mn}]+)/mui';`
			`$replacement = '$1<a href="'. $indexUrl .'?addtag=$2" title="Hashtag $2">#$2</a>';`
			`return preg_replace($regex, $replacement, $description);`
			`}`

			`/**`
			`* This function inserts   where relevant so that multiple spaces are properly displayed in HTML`
			`* even in the absence of <pre> (This is used in description to keep text formatting).`
			`*`
			`* @param string $text input text.`
			`*`
			`* @return string formatted text.`
			`*/`
			`function space2nbsp($text)`
			`{`
			`return preg_replace('/(^\| ) /m', '$1 ', $text);`
			`}`

			`/**`
			`* Format Shaarli's description`
			`*`
			`* @param string $description shaare's description.`
			`* @param string $redirector if a redirector is set, use it to gerenate links.`
			`*`
			`* @return string formatted description.`
			`*/`
			`function format_description($description, $redirector = '', $indexUrl = '') {`
			`return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector), $indexUrl)));`
			`}`