[html] Clarify meaning of strange find() parameter
simple_html_dom currently doesn't support "->find('*')", which is a known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/ The solution implemented by RSS-Bridge is to find all nodes WITHOUT a specific attribute. If the attribute is very unlikely to appear in the DOM, this essentially returns all nodes. This is the meaning behind "->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]')".
This commit is contained in:
parent
67004556e6
commit
54800fcc8d
1 changed files with 22 additions and 5 deletions
27
lib/html.php
27
lib/html.php
|
@ -26,8 +26,6 @@
|
||||||
* already removes some of the tags (search for `remove_noise` in simple_html_dom.php).
|
* already removes some of the tags (search for `remove_noise` in simple_html_dom.php).
|
||||||
* @todo Rename parameters to make more sense. `$textToSanitize` must be HTML,
|
* @todo Rename parameters to make more sense. `$textToSanitize` must be HTML,
|
||||||
* `$removedTags`, `$keptAttributes` and `$keptText` are past tense.
|
* `$removedTags`, `$keptAttributes` and `$keptText` are past tense.
|
||||||
* @todo Clarify the meaning of `*[!b38fd2b1fe7f4747d6b1c1254ccd055e]`, which
|
|
||||||
* looks like a SHA1 hash (does simplehtmldom not support `find('*')`?).
|
|
||||||
*/
|
*/
|
||||||
function sanitize($textToSanitize,
|
function sanitize($textToSanitize,
|
||||||
$removedTags = array('script', 'iframe', 'input', 'form'),
|
$removedTags = array('script', 'iframe', 'input', 'form'),
|
||||||
|
@ -35,6 +33,17 @@ $keptAttributes = array('title', 'href', 'src'),
|
||||||
$keptText = array()){
|
$keptText = array()){
|
||||||
$htmlContent = str_get_html($textToSanitize);
|
$htmlContent = str_get_html($textToSanitize);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Notice: simple_html_dom currently doesn't support "->find('*')", which is a
|
||||||
|
* known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
|
||||||
|
*
|
||||||
|
* A solution to this is to find all nodes WITHOUT a specific attribute. If
|
||||||
|
* the attribute is very unlikely to appear in the DOM, this is essentially
|
||||||
|
* returning all nodes.
|
||||||
|
*
|
||||||
|
* "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib
|
||||||
|
* "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM.
|
||||||
|
*/
|
||||||
foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
|
foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
|
||||||
if(in_array($element->tag, $keptText)) {
|
if(in_array($element->tag, $keptText)) {
|
||||||
$element->outertext = $element->plaintext;
|
$element->outertext = $element->plaintext;
|
||||||
|
@ -76,15 +85,23 @@ $keptText = array()){
|
||||||
*
|
*
|
||||||
* @param string $htmlContent The HTML content
|
* @param string $htmlContent The HTML content
|
||||||
* @return string The HTML content with all occurrences replaced
|
* @return string The HTML content with all occurrences replaced
|
||||||
*
|
|
||||||
* @todo Clarify the meaning of `*[!b38fd2b1fe7f4747d6b1c1254ccd055e]`, which
|
|
||||||
* looks like a SHA1 hash (does simplehtmldom not support `find('*')`?).
|
|
||||||
*/
|
*/
|
||||||
function backgroundToImg($htmlContent) {
|
function backgroundToImg($htmlContent) {
|
||||||
|
|
||||||
$regex = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/';
|
$regex = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/';
|
||||||
$htmlContent = str_get_html($htmlContent);
|
$htmlContent = str_get_html($htmlContent);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Notice: simple_html_dom currently doesn't support "->find('*')", which is a
|
||||||
|
* known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
|
||||||
|
*
|
||||||
|
* A solution to this is to find all nodes WITHOUT a specific attribute. If
|
||||||
|
* the attribute is very unlikely to appear in the DOM, this is essentially
|
||||||
|
* returning all nodes.
|
||||||
|
*
|
||||||
|
* "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib
|
||||||
|
* "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM.
|
||||||
|
*/
|
||||||
foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
|
foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
|
||||||
|
|
||||||
if(preg_match($regex, $element->style, $matches) > 0) {
|
if(preg_match($regex, $element->style, $matches) > 0) {
|
||||||
|
|
Loading…
Reference in a new issue