Hello World!

` becomes `Hello World!`).
 * @return object A simplehtmldom object of the remaining contents.
 *
 * @todo Check if this implementation is still necessary, because simplehtmldom
 * already removes some of the tags (search for `remove_noise` in simple_html_dom.php).
 */
function sanitize($html,
	$tags_to_remove = array('script', 'iframe', 'input', 'form'),
	$attributes_to_keep = array('title', 'href', 'src'),
	$text_to_keep = array()){

	$dom = str_get_html($html);

	/*
	 * Notice: simple_html_dom currently doesn't support "->find(*)", which is a
	 * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
	 *
	 * The workaround is to select every node that does NOT carry a given
	 * attribute. "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear
	 * in any DOM, so "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" effectively
	 * returns all nodes.
	 */
	foreach($dom->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $node) {

		// Tags listed in $text_to_keep are flattened to their plain text
		if(in_array($node->tag, $text_to_keep)) {
			$node->outertext = $node->plaintext;
			continue;
		}

		// Tags listed in $tags_to_remove are dropped together with their contents
		if(in_array($node->tag, $tags_to_remove)) {
			$node->outertext = '';
			continue;
		}

		// Every other tag stays, but loses all attributes not explicitly kept
		foreach($node->getAllAttributes() as $name => $value) {
			if(in_array($name, $attributes_to_keep)) {
				continue;
			}
			$node->removeAttribute($name);
		}
	}

	return $dom;
}

/**
 * Replace background by image
 *
 * Replaces tags with styles of `background-image` by `<img>` tags.
 *
 * For example:
 *
 * ```HTML
 *
 *
 *

Hello world!

*
 *
 *
 * ```
 *
 * results in this output (for an element styled with
 * `background-image: url('bgimage.jpg');`):
 *
 * ```HTML
 * <html>
 *   <img style="display:block;" src="bgimage.jpg" />
 * </html>
 * ```
 *
 * The whole matching element (including its children) is replaced, because
 * the replacement is assigned to the element's `outertext`.
 *
 * @param string $htmlContent The HTML content
 * @return object The simplehtmldom object built from `$htmlContent` with all
 * occurrences replaced
 */
function backgroundToImg($htmlContent) {

	// Group 1 captures the URL passed to `background-image: url(...)`,
	// with or without surrounding quotes.
	$regex = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/';
	$htmlContent = str_get_html($htmlContent);

	/*
	 * Notice: simple_html_dom currently doesn't support "->find(*)", which is a
	 * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
	 *
	 * A solution to this is to find all nodes WITHOUT a specific attribute. If
	 * the attribute is very unlikely to appear in the DOM, this is essentially
	 * returning all nodes.
	 *
	 * "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib
	 * "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM.
	 */
	foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {

		if(preg_match($regex, $element->style, $matches) > 0) {
			// Swap the styled element for an image pointing at the captured URL
			// (previously the element was replaced by an empty string, which
			// silently deleted it and discarded the URL).
			$element->outertext = '<img style="display:block;" src="' . $matches[1] . '" />';
		}

	}

	return $htmlContent;
}

/**
 * Convert relative links in HTML into absolute links
 *
 * This function is based on `php-urljoin`.
 *
 * @link https://github.com/plaidfluff/php-urljoin php-urljoin
 *
 * @param string|object $content The HTML content. Supports HTML objects or string objects
 * @param string $server Fully qualified URL to the page containing relative links
 * @return object Content with fixed URLs.
 *
 * @todo If the input type was a string, this function should return a string as
 * well. This is currently done implicitly by how the simplehtmldom object works.
*/
function defaultLinkTo($content, $server){

	// Remember the input type: a string input is parsed here and turned back
	// into a string before returning.
	$input_was_string = is_string($content);
	$dom = $input_was_string ? str_get_html($content) : $content;

	// Selector => attribute holding the URL. Images are handled before anchors.
	$url_attributes = array(
		'img' => 'src',
		'a' => 'href'
	);

	foreach($url_attributes as $selector => $attribute) {
		foreach($dom->find($selector) as $node) {
			$node->$attribute = urljoin($server, $node->$attribute);
		}
	}

	return $input_was_string ? $dom->outertext : $dom;
}

/**
 * Extract the first part of a string matching the specified start and end delimiters
 *
 * @param string $string Input string, e.g. `
Post author: John Doe
`
 * @param string $start Start delimiter, e.g. `author: `
 * @param string $end End delimiter, e.g. `<`
 * @return string|bool Extracted string, e.g. `John Doe`, or false if either
 * delimiter was not found (the end delimiter is searched after the start
 * delimiter).
 */
function extractFromDelimiters($string, $start, $end) {
	$start_position = strpos($string, $start);
	if($start_position === false) {
		return false;
	}

	// Everything following the first occurrence of the start delimiter
	$section_retrieved = substr($string, $start_position + strlen($start));

	// A missing end delimiter used to yield an empty string (strpos() === false
	// was used as a length of 0); report it as "not found" as documented.
	$end_position = strpos($section_retrieved, $end);
	if($end_position === false) {
		return false;
	}

	return substr($section_retrieved, 0, $end_position);
}

/**
 * Remove one or more part(s) of a string using a start and end delimiters
 *
 * Every section beginning with `$start` and running up to and including the
 * next `$end` is removed. A start delimiter without a matching end delimiter
 * is left untouched, as is the whole string if a delimiter is empty.
 *
 * @param string $string Input string, e.g. `foo<script>superscript()</script>bar`
 * @param string $start Start delimiter, e.g. `<script`
 * @param string $end End delimiter, e.g. `</script>`
 * @return string Cleaned string, e.g. `foobar`
 */
function stripWithDelimiters($string, $start, $end) {
	// Empty delimiters can never make progress and used to loop forever
	if((string)$start === '' || (string)$end === '') {
		return $string;
	}

	while(($start_position = strpos($string, $start)) !== false) {
		$section_to_remove = substr($string, $start_position);

		// The search deliberately starts at the beginning of the section, so an
		// end delimiter overlapping the start delimiter is still honoured.
		$end_position = strpos($section_to_remove, $end);
		if($end_position === false) {
			// Unterminated section: stop instead of cutting off an arbitrary
			// strlen($end)-sized prefix as the previous implementation did.
			break;
		}

		$section_to_remove = substr($section_to_remove, 0, $end_position + strlen($end));
		$string = str_replace($section_to_remove, '', $string);
	}

	return $string;
}

/**
 * Remove HTML sections containing one or more sections using the same HTML tag
 *
 * @param string $string Input string, e.g. `foo
ads
ads
bar` * @param string $tag_name Name of the HTML tag, e.g. `div` * @param string $tag_start Start of the HTML tag to remove, e.g. `
`
 * @return string Cleaned String, e.g. `foobar`. The input is returned as-is
 * when `$tag_start` does not begin with `<$tag_name`; removal stops (keeping
 * what is left) when a matching closing tag cannot be found.
 *
 * @todo This function needs more documentation to make it maintainable.
 */
function stripRecursiveHTMLSection($string, $tag_name, $tag_start){
	$open_tag = '<' . $tag_name;
	// The closing tag had degraded to an empty string, which made the
	// strpos()/substr_count() calls below meaningless (empty needle).
	$close_tag = '</' . $tag_name . '>';
	$close_tag_length = strlen($close_tag);

	// $tag_start must itself be an opening tag of type $tag_name
	if(strpos($tag_start, $open_tag) !== 0) {
		return $string;
	}

	while(($section_start = strpos($string, $tag_start)) !== false) {
		// Upper bound on the number of closing tags examined per section
		$max_recursion = 100;
		$section_to_remove = null;
		$search_offset = $section_start;

		// Extend the candidate section one closing tag at a time until it holds
		// as many closing tags as opening tags, i.e. nested tags are balanced.
		do {
			$max_recursion--;
			$section_end = strpos($string, $close_tag, $search_offset);

			if($section_end === false) {
				// Unbalanced markup: no closing tag left. Bail out rather than
				// computing a section from a bogus offset (which could remove
				// unrelated text or never terminate).
				return $string;
			}

			$search_offset = $section_end + $close_tag_length;
			$section_to_remove = substr($string, $section_start, $search_offset - $section_start);
			$open_tag_count = substr_count($section_to_remove, $open_tag);
			$close_tag_count = substr_count($section_to_remove, $close_tag);
		} while ($open_tag_count > $close_tag_count && $max_recursion > 0);

		$string = str_replace($section_to_remove, '', $string);
	}

	return $string;
}

/**
 * Convert Markdown into HTML. Only a subset of the Markdown syntax is implemented.
*
 * Supported syntax: images, links, bold (`**` / `__`), italic (`*` / `_`),
 * horizontal separators (six or more dashes), `&#10;` line breaks and plain
 * `http(s)://` / `www.` URLs. The result is wrapped in a `<div>`.
 *
 * @link https://daringfireball.net/projects/markdown/ Markdown
 * @link https://github.github.com/gfm/ GitHub Flavored Markdown Spec
 *
 * @param string $string Input string in Markdown format
 * @return string output string in HTML format
 */
function markdownToHtml($string) {

	//For more details about how these regex work:
	// https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702
	// Images: https://regex101.com/r/JW9Evr/1
	// Links: https://regex101.com/r/eRGVe7/1
	// Bold: https://regex101.com/r/2p40Y0/1
	// Italic: https://regex101.com/r/xJkET9/1
	// Separator: https://regex101.com/r/ZBEqFP/1
	// Plain URL: https://regex101.com/r/2JHYwb/1
	// Site name: https://regex101.com/r/qIuKYE/1

	// Images must be converted before links, as their syntax contains a link
	$string = preg_replace('/\!\[([^\]]+)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
	$string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>', $string);

	// Double markers (bold) are handled before single markers (italic)
	$string = preg_replace('/\*\*(.*)\*\*/U', '<b>$1</b>', $string);
	$string = preg_replace('/\*(.*)\*/U', '<i>$1</i>', $string);
	$string = preg_replace('/__(.*)__/U', '<b>$1</b>', $string);
	$string = preg_replace('/_(.*)_/U', '<i>$1</i>', $string);

	$string = preg_replace('/[-]{6,99}/', '<hr />', $string);
	$string = str_replace('&#10;', '<br />', $string);

	// The trailing space gives the final "[^"]" group something to match when
	// the URL is the last thing in the string; it is trimmed again on return.
	$string = preg_replace('/([^"])(https?:\/\/[^ "<]+)([^"])/', '$1<a href="$2">$2</a>$3', $string . ' ');
	$string = preg_replace('/([^"\/])(www\.[^ "<]+)([^"])/', '$1<a href="http://$2">$2</a>$3', $string . ' ');

	//As the regex are not perfect, we need to fix <i> and </i> that are introduced in URLs
	// Fixup regex <i> : https://regex101.com/r/NTRPf6/1
	// Fixup regex </i>: https://regex101.com/r/aNklRp/1
	// Each pass restores one underscore per attribute, so repeat until no
	// replacement happens anymore.
	$count = 1;
	while($count > 0) {
		$string = preg_replace('/ (src|href)="([^"]+)<i>([^"]+)"/U', ' $1="$2_$3"', $string, -1, $count);
	}

	$count = 1;
	while($count > 0) {
		$string = preg_replace('/ (src|href)="([^"]+)<\/i>([^"]+)"/U', ' $1="$2_$3"', $string, -1, $count);
	}

	return '<div>' . trim($string) . '</div>';
}