Revert "all: Use ->remove() instead of ->outertext = ''"

This reverts commit 052844f5e1.

There is a bug in ->remove() that causes the parser to keep finding
elements in the DOM tree that should no longer exist after removal.

References #1151
This commit is contained in:
logmanoriginal 2019-06-02 13:03:26 +02:00
parent 468d8be72d
commit 6c4098d655
21 changed files with 49 additions and 44 deletions

View file

@ -50,18 +50,18 @@ class AsahiShimbunAJWBridge extends BridgeAbstract {
$e_lead = $element->find('span.Lead', 0); $e_lead = $element->find('span.Lead', 0);
if ($e_lead) { if ($e_lead) {
$item['content'] = $e_lead->innertext; $item['content'] = $e_lead->innertext;
$e_lead->remove(); $e_lead->outertext = '';
} else { } else {
$item['content'] = $element->innertext; $item['content'] = $element->innertext;
} }
$e_date = $element->find('span.EnDate', 0); $e_date = $element->find('span.EnDate', 0);
if ($e_date) { if ($e_date) {
$item['timestamp'] = strtotime($e_date->innertext); $item['timestamp'] = strtotime($e_date->innertext);
$e_date->remove(); $e_date->outertext = '';
} }
$e_video = $element->find('span.EnVideo', 0); $e_video = $element->find('span.EnVideo', 0);
if ($e_video) { if ($e_video) {
$e_video->remove(); $e_video->outertext = '';
$element->innertext = "VIDEO: $element->innertext"; $element->innertext = "VIDEO: $element->innertext";
} }
$item['title'] = $element->innertext; $item['title'] = $element->innertext;

View file

@ -55,7 +55,7 @@ class BundesbankBridge extends BridgeAbstract {
$title = $study->find('.teasable__title div.h2', 0); $title = $study->find('.teasable__title div.h2', 0);
foreach($title->children as &$child) { foreach($title->children as &$child) {
$child->remove(); $child->outertext = '';
} }
$item['title'] = $title->innertext; $item['title'] = $title->innertext;

View file

@ -58,7 +58,7 @@ class CastorusBridge extends BridgeAbstract {
returnServerError('Cannot find nodes!'); returnServerError('Cannot find nodes!');
foreach($nodes as $node) { foreach($nodes as $node) {
$node->remove(); $node->outertext = '';
} }
return strtotime($activity->innertext); return strtotime($activity->innertext);

View file

@ -50,7 +50,7 @@ class DauphineLibereBridge extends FeedExpander {
private function extractContent($url){ private function extractContent($url){
$html2 = getSimpleHTMLDOMCached($url); $html2 = getSimpleHTMLDOMCached($url);
foreach ($html2->find('.noprint, link, script, iframe, .shareTool, .contentInfo') as $remove) { foreach ($html2->find('.noprint, link, script, iframe, .shareTool, .contentInfo') as $remove) {
$remove->remove(); $remove->outertext = '';
} }
return $html2->find('div.content', 0)->innertext; return $html2->find('div.content', 0)->innertext;
} }

View file

@ -29,16 +29,16 @@ class EconomistBridge extends BridgeAbstract {
// Remove newsletter subscription box // Remove newsletter subscription box
$newsletter = $content->find('div[class="newsletter-form__message"]', 0); $newsletter = $content->find('div[class="newsletter-form__message"]', 0);
if ($newsletter) if ($newsletter)
$newsletter->remove(); $newsletter->outertext = '';
$newsletterForm = $content->find('form', 0); $newsletterForm = $content->find('form', 0);
if ($newsletterForm) if ($newsletterForm)
$newsletterForm->remove(); $newsletterForm->outertext = '';
// Remove next and previous article URLs at the bottom // Remove next and previous article URLs at the bottom
$nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0); $nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0);
if ($nextprev) if ($nextprev)
$nextprev->remove(); $nextprev->outertext = '';
$section = [ $article->find('h3[itemprop="articleSection"]', 0)->plaintext ]; $section = [ $article->find('h3[itemprop="articleSection"]', 0)->plaintext ];

View file

@ -584,7 +584,7 @@ EOD;
foreach($content_filters as $filter) { foreach($content_filters as $filter) {
foreach($content->find($filter) as $subject) { foreach($content->find($filter) as $subject) {
$subject->remove(); $subject->outertext = '';
} }
} }

View file

@ -50,7 +50,7 @@ class HaveIBeenPwnedBridge extends BridgeAbstract {
$permalink = $breach->find('p', 1)->find('a', 0)->href; $permalink = $breach->find('p', 1)->find('a', 0)->href;
// Remove permalink // Remove permalink
$breach->find('p', 1)->find('a', 0)->remove(); $breach->find('p', 1)->find('a', 0)->outertext = '';
$item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . ' breached accounts'; $item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . ' breached accounts';
$item['dateAdded'] = strtotime($dateAdded[1]); $item['dateAdded'] = strtotime($dateAdded[1]);

View file

@ -239,16 +239,16 @@ class JustETFBridge extends BridgeAbstract {
or returnServerError('Article body not found!'); or returnServerError('Article body not found!');
// Remove teaser image // Remove teaser image
$element->find('img.teaser-img', 0)->remove(); $element->find('img.teaser-img', 0)->outertext = '';
// Remove self advertisements // Remove self advertisements
foreach($element->find('.call-action') as $adv) { foreach($element->find('.call-action') as $adv) {
$adv->remove(); $adv->outertext = '';
} }
// Remove tips // Remove tips
foreach($element->find('.panel-edu') as $tip) { foreach($element->find('.panel-edu') as $tip) {
$tip->remove(); $tip->outertext = '';
} }
// Remove inline scripts (used for e.g. interactive graphs) as they are // Remove inline scripts (used for e.g. interactive graphs) as they are
@ -318,7 +318,7 @@ class JustETFBridge extends BridgeAbstract {
$description = $description->parent(); $description = $description->parent();
foreach($description->find('div') as $div) { foreach($description->find('div') as $div) {
$div->remove(); $div->outertext = '';
} }
$quote = $html->find('div.infobox div.val', 0) $quote = $html->find('div.infobox div.val', 0)

View file

@ -61,7 +61,7 @@ class NextgovBridge extends FeedExpander {
return 'Could not request Nextgov: ' . $url; return 'Could not request Nextgov: ' . $url;
$contents = $article->find('div.wysiwyg', 0); $contents = $article->find('div.wysiwyg', 0);
$contents->find('svg.content-tombstone', 0)->remove(); $contents->find('svg.content-tombstone', 0)->outertext = '';
$contents = $contents->innertext; $contents = $contents->innertext;
$contents = stripWithDelimiters($contents, '<div class="ad-container">', '</div>'); $contents = stripWithDelimiters($contents, '<div class="ad-container">', '</div>');
$contents = stripWithDelimiters($contents, '<div', '</div>'); //ad outer div $contents = stripWithDelimiters($contents, '<div', '</div>'); //ad outer div

View file

@ -51,7 +51,7 @@ class OsmAndBlogBridge extends BridgeAbstract {
private function cleanupContent($content, ...$removeItems) { private function cleanupContent($content, ...$removeItems) {
foreach ($removeItems as $obj) { foreach ($removeItems as $obj) {
if ($obj) $obj->remove(); if ($obj) $obj->outertext = '';
} }
foreach ($content->find('img') as $obj) { foreach ($content->find('img') as $obj) {
$obj->src = $this->filterURL($obj->src); $obj->src = $this->filterURL($obj->src);

View file

@ -83,7 +83,7 @@ class PikabuBridge extends BridgeAbstract {
foreach($el_to_remove_selectors as $el_to_remove_selector) { foreach($el_to_remove_selectors as $el_to_remove_selector) {
foreach($post->find($el_to_remove_selector) as $el) { foreach($post->find($el_to_remove_selector) as $el) {
$el->remove(); $el->outertext = '';
} }
} }

View file

@ -38,17 +38,20 @@ class RadioMelodieBridge extends BridgeAbstract {
$imgs = $textDOM->find('img[src^="http://www.radiomelodie.com/image.php]'); $imgs = $textDOM->find('img[src^="http://www.radiomelodie.com/image.php]');
foreach($imgs as $img) { foreach($imgs as $img) {
$img->src = $this->rewriteImage($img->src); $img->src = $this->rewriteImage($img->src);
$article->save();
} }
// Remove Google Ads // Remove Google Ads
$ads = $article->find('div[class=adInline]'); $ads = $article->find('div[class=adInline]');
foreach($ads as $ad) { foreach($ads as $ad) {
$ad->remove(); $ad->outertext = '';
$article->save();
} }
// Remove Radio Melodie Logo // Remove Radio Melodie Logo
$logoHTML = $article->find('div[id=logoArticleRM]', 0); $logoHTML = $article->find('div[id=logoArticleRM]', 0);
$logoHTML->remove(); $logoHTML->outertext = '';
$article->save();
$author = $article->find('p[class=AuthorName]', 0)->plaintext; $author = $article->find('p[class=AuthorName]', 0)->plaintext;
@ -62,7 +65,8 @@ class RadioMelodieBridge extends BridgeAbstract {
$header = '<img src="' . $picture[0] . '"/>'; $header = '<img src="' . $picture[0] . '"/>';
// Remove the Date and Author part // Remove the Date and Author part
$textDOM->find('div[class=AuthorDate]', 0)->remove(); $textDOM->find('div[class=AuthorDate]', 0)->outertext = '';
$article->save();
$text = $textDOM->innertext; $text = $textDOM->innertext;
$item['content'] = '<h1>' . $item['title'] . '</h1>' . $date . '<br/>' . $header . $text; $item['content'] = '<h1>' . $item['title'] . '</h1>' . $date . '<br/>' . $header . $text;
$this->items[] = $item; $this->items[] = $item;

View file

@ -48,7 +48,7 @@ class SIMARBridge extends BridgeAbstract {
foreach($e_item->find('p') as $paragraph) { foreach($e_item->find('p') as $paragraph) {
/* Remove empty paragraphs */ /* Remove empty paragraphs */
if (preg_match('/^(\W|&nbsp;)+$/', $paragraph->innertext) == 1) { if (preg_match('/^(\W|&nbsp;)+$/', $paragraph->innertext) == 1) {
$paragraph->remove(); $paragraph->outertext = '';
} }
} }
if ($e_item) { if ($e_item) {

View file

@ -18,7 +18,7 @@ class ScmbBridge extends BridgeAbstract {
$item['title'] = $article->find('header h1 a', 0)->innertext; $item['title'] = $article->find('header h1 a', 0)->innertext;
// remove text "En savoir plus" from anecdote content // remove text "En savoir plus" from anecdote content
$article->find('span.read-more', 0)->remove(); $article->find('span.read-more', 0)->outertext = '';
$content = $article->find('p.summary a', 0)->innertext; $content = $article->find('p.summary a', 0)->innertext;
// remove superfluous spaces at the end // remove superfluous spaces at the end

View file

@ -171,7 +171,7 @@ class TwitterBridge extends BridgeAbstract {
// remove 'invisible' content // remove 'invisible' content
foreach($tweet->find('.invisible') as $invisible) { foreach($tweet->find('.invisible') as $invisible) {
$invisible->remove(); $invisible->outertext = '';
} }
// Skip promoted tweets // Skip promoted tweets

View file

@ -62,8 +62,9 @@ class VkBridge extends BridgeAbstract
$this->pageName = htmlspecialchars_decode($pageName); $this->pageName = htmlspecialchars_decode($pageName);
} }
foreach ($html->find('div.replies') as $comment_block) { foreach ($html->find('div.replies') as $comment_block) {
$comment_block->remove(); $comment_block->outertext = '';
} }
$html->load($html->save());
$pinned_post_item = null; $pinned_post_item = null;
$last_post_id = 0; $last_post_id = 0;
@ -81,7 +82,7 @@ class VkBridge extends BridgeAbstract
if (is_object($post->find('a.wall_post_more', 0))) { if (is_object($post->find('a.wall_post_more', 0))) {
//delete link "show full" in content //delete link "show full" in content
$post->find('a.wall_post_more', 0)->remove(); $post->find('a.wall_post_more', 0)->outertext = '';
} }
$content_suffix = ''; $content_suffix = '';
@ -113,7 +114,7 @@ class VkBridge extends BridgeAbstract
foreach($external_link_selectors_to_remove as $sel) { foreach($external_link_selectors_to_remove as $sel) {
if (is_object($post->find($sel, 0))) { if (is_object($post->find($sel, 0))) {
$post->find($sel, 0)->remove(); $post->find($sel, 0)->outertext = '';
} }
} }
@ -139,7 +140,7 @@ class VkBridge extends BridgeAbstract
$content_suffix .= "<br><img src='" . $matches[1] . "'>"; $content_suffix .= "<br><img src='" . $matches[1] . "'>";
} }
$content_suffix .= "<br>Article: <a href='$article_link'>$article_title ($article_author)</a>"; $content_suffix .= "<br>Article: <a href='$article_link'>$article_title ($article_author)</a>";
$article->remove(); $article->outertext = '';
} }
// get video on post // get video on post
@ -149,7 +150,7 @@ class VkBridge extends BridgeAbstract
$video_title = $video->find('div.post_video_title', 0)->plaintext; $video_title = $video->find('div.post_video_title', 0)->plaintext;
$video_link = $video->find('a.lnk', 0)->getAttribute('href'); $video_link = $video->find('a.lnk', 0)->getAttribute('href');
$this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos);
$video->remove(); $video->outertext = '';
$main_video_link = $video_link; $main_video_link = $video_link;
} }
@ -160,14 +161,14 @@ class VkBridge extends BridgeAbstract
if (count($temp) > 1) $video_title = $temp[1]; if (count($temp) > 1) $video_title = $temp[1];
$video_link = $a->getAttribute('href'); $video_link = $a->getAttribute('href');
if ($video_link != $main_video_link) $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); if ($video_link != $main_video_link) $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos);
$a->remove(); $a->outertext = '';
} }
// get all photos // get all photos
foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) { foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) {
$result = $this->getPhoto($a); $result = $this->getPhoto($a);
if ($result == null) continue; if ($result == null) continue;
$a->remove(); $a->outertext = '';
$content_suffix .= "<br>$result"; $content_suffix .= "<br>$result";
} }
@ -176,7 +177,7 @@ class VkBridge extends BridgeAbstract
$a = $el->find('.page_album_link', 0); $a = $el->find('.page_album_link', 0);
$album_title = $a->find('.page_album_title_text', 0)->getAttribute('title'); $album_title = $a->find('.page_album_title_text', 0)->getAttribute('title');
$album_link = $a->getAttribute('href'); $album_link = $a->getAttribute('href');
$el->remove(); $el->outertext = '';
$content_suffix .= "<br>Album: <a href='$album_link'>$album_title</a>"; $content_suffix .= "<br>Album: <a href='$album_link'>$album_title</a>";
} }
@ -199,7 +200,7 @@ class VkBridge extends BridgeAbstract
} }
$a->remove(); $a->outertext = '';
} }
// get other documents // get other documents
@ -216,7 +217,7 @@ class VkBridge extends BridgeAbstract
} }
$div->remove(); $div->outertext = '';
} }
// get polls // get polls
@ -226,14 +227,14 @@ class VkBridge extends BridgeAbstract
foreach($div->find('div.page_poll_text') as $poll_stat_title) { foreach($div->find('div.page_poll_text') as $poll_stat_title) {
$content_suffix .= '<br>- ' . $poll_stat_title->innertext; $content_suffix .= '<br>- ' . $poll_stat_title->innertext;
} }
$div->remove(); $div->outertext = '';
} }
// get sign // get sign
$post_author = $pageName; $post_author = $pageName;
foreach($post->find('a.wall_signed_by') as $a) { foreach($post->find('a.wall_signed_by') as $a) {
$post_author = $a->innertext; $post_author = $a->innertext;
$a->remove(); $a->outertext = '';
} }
if (is_object($post->find('div.copy_quote', 0))) { if (is_object($post->find('div.copy_quote', 0))) {
@ -242,7 +243,7 @@ class VkBridge extends BridgeAbstract
} }
$copy_quote = $post->find('div.copy_quote', 0); $copy_quote = $post->find('div.copy_quote', 0);
if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) { if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) {
$copy_post_header->remove(); $copy_post_header->outertext = '';
} }
$copy_quote_content = $copy_quote->innertext; $copy_quote_content = $copy_quote->innertext;
$copy_quote->outertext = "<br>Reposted: <br>$copy_quote_content"; $copy_quote->outertext = "<br>Reposted: <br>$copy_quote_content";

View file

@ -141,7 +141,7 @@ class WikipediaBridge extends BridgeAbstract {
$anchorFallbackIndex = 0){ $anchorFallbackIndex = 0){
// Clean the bottom of the featured article // Clean the bottom of the featured article
if ($element->find('div', -1)) if ($element->find('div', -1))
$element->find('div', -1)->remove(); $element->find('div', -1)->outertext = '';
// The title and URI of the article can be found in an anchor containing // The title and URI of the article can be found in an anchor containing
// the string '...' in most wikis ('full article ...') // the string '...' in most wikis ('full article ...')
@ -202,10 +202,10 @@ class WikipediaBridge extends BridgeAbstract {
// Let's remove a couple of things from the article // Let's remove a couple of things from the article
$table = $content->find('#toc', 0); // Table of contents $table = $content->find('#toc', 0); // Table of contents
if(!$table === false) if(!$table === false)
$table->remove(); $table->outertext = '';
foreach($content->find('ol.references') as $reference) // References foreach($content->find('ol.references') as $reference) // References
$reference->remove(); $reference->outertext = '';
return str_replace('href="/', 'href="' . $this->getURI() . '/', $content->innertext); return str_replace('href="/', 'href="' . $this->getURI() . '/', $content->innertext);
} }

View file

@ -50,7 +50,7 @@ class WordPressBridge extends FeedExpander {
foreach ($article->find('h1.entry-title') as $title) foreach ($article->find('h1.entry-title') as $title)
if ($title->plaintext == $item['title']) if ($title->plaintext == $item['title'])
$title->remove(); $title->outertext = '';
$article_image = $article_html->find('img.wp-post-image', 0); $article_image = $article_html->find('img.wp-post-image', 0);
if(!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) { if(!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) {

View file

@ -44,7 +44,7 @@ class WorldOfTanksBridge extends FeedExpander {
// Remove the scripts, please // Remove the scripts, please
foreach($content->find('script') as $script) { foreach($content->find('script') as $script) {
$script->remove(); $script->outertext = '';
} }
return $content->innertext; return $content->innertext;

View file

@ -193,7 +193,7 @@ class XenForoBridge extends BridgeAbstract {
// Remove script tags // Remove script tags
foreach($content->find('script') as $script) { foreach($content->find('script') as $script) {
$script->remove(); $script->outertext = '';
} }
$item['content'] = $content->innertext; $item['content'] = $content->innertext;

View file

@ -36,7 +36,7 @@ function sanitize($html,
if(in_array($element->tag, $text_to_keep)) { if(in_array($element->tag, $text_to_keep)) {
$element->outertext = $element->plaintext; $element->outertext = $element->plaintext;
} elseif(in_array($element->tag, $tags_to_remove)) { } elseif(in_array($element->tag, $tags_to_remove)) {
$element->remove(); $element->outertext = '';
} else { } else {
foreach($element->getAllAttributes() as $attributeName => $attribute) { foreach($element->getAllAttributes() as $attributeName => $attribute) {
if(!in_array($attributeName, $attributes_to_keep)) if(!in_array($attributeName, $attributes_to_keep))