Revert "all: Use ->remove() instead of ->outertext = ''"

This reverts commit 052844f5e1.

There is a bug in ->remove() that causes the parser to incorrectly
identify elements in the DOM tree that shouldn't exist anymore.

References #1151
This commit is contained in:
logmanoriginal 2019-06-02 13:03:26 +02:00
parent 468d8be72d
commit 6c4098d655
21 changed files with 49 additions and 44 deletions

View file

@ -50,18 +50,18 @@ class AsahiShimbunAJWBridge extends BridgeAbstract {
$e_lead = $element->find('span.Lead', 0);
if ($e_lead) {
$item['content'] = $e_lead->innertext;
$e_lead->remove();
$e_lead->outertext = '';
} else {
$item['content'] = $element->innertext;
}
$e_date = $element->find('span.EnDate', 0);
if ($e_date) {
$item['timestamp'] = strtotime($e_date->innertext);
$e_date->remove();
$e_date->outertext = '';
}
$e_video = $element->find('span.EnVideo', 0);
if ($e_video) {
$e_video->remove();
$e_video->outertext = '';
$element->innertext = "VIDEO: $element->innertext";
}
$item['title'] = $element->innertext;

View file

@ -55,7 +55,7 @@ class BundesbankBridge extends BridgeAbstract {
$title = $study->find('.teasable__title div.h2', 0);
foreach($title->children as &$child) {
$child->remove();
$child->outertext = '';
}
$item['title'] = $title->innertext;

View file

@ -58,7 +58,7 @@ class CastorusBridge extends BridgeAbstract {
returnServerError('Cannot find nodes!');
foreach($nodes as $node) {
$node->remove();
$node->outertext = '';
}
return strtotime($activity->innertext);

View file

@ -50,7 +50,7 @@ class DauphineLibereBridge extends FeedExpander {
private function extractContent($url){
$html2 = getSimpleHTMLDOMCached($url);
foreach ($html2->find('.noprint, link, script, iframe, .shareTool, .contentInfo') as $remove) {
$remove->remove();
$remove->outertext = '';
}
return $html2->find('div.content', 0)->innertext;
}

View file

@ -29,16 +29,16 @@ class EconomistBridge extends BridgeAbstract {
// Remove newsletter subscription box
$newsletter = $content->find('div[class="newsletter-form__message"]', 0);
if ($newsletter)
$newsletter->remove();
$newsletter->outertext = '';
$newsletterForm = $content->find('form', 0);
if ($newsletterForm)
$newsletterForm->remove();
$newsletterForm->outertext = '';
// Remove next and previous article URLs at the bottom
$nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0);
if ($nextprev)
$nextprev->remove();
$nextprev->outertext = '';
$section = [ $article->find('h3[itemprop="articleSection"]', 0)->plaintext ];

View file

@ -584,7 +584,7 @@ EOD;
foreach($content_filters as $filter) {
foreach($content->find($filter) as $subject) {
$subject->remove();
$subject->outertext = '';
}
}

View file

@ -50,7 +50,7 @@ class HaveIBeenPwnedBridge extends BridgeAbstract {
$permalink = $breach->find('p', 1)->find('a', 0)->href;
// Remove permalink
$breach->find('p', 1)->find('a', 0)->remove();
$breach->find('p', 1)->find('a', 0)->outertext = '';
$item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . ' breached accounts';
$item['dateAdded'] = strtotime($dateAdded[1]);

View file

@ -239,16 +239,16 @@ class JustETFBridge extends BridgeAbstract {
or returnServerError('Article body not found!');
// Remove teaser image
$element->find('img.teaser-img', 0)->remove();
$element->find('img.teaser-img', 0)->outertext = '';
// Remove self advertisements
foreach($element->find('.call-action') as $adv) {
$adv->remove();
$adv->outertext = '';
}
// Remove tips
foreach($element->find('.panel-edu') as $tip) {
$tip->remove();
$tip->outertext = '';
}
// Remove inline scripts (used e.g. for interactive graphs) as they are
@ -318,7 +318,7 @@ class JustETFBridge extends BridgeAbstract {
$description = $description->parent();
foreach($description->find('div') as $div) {
$div->remove();
$div->outertext = '';
}
$quote = $html->find('div.infobox div.val', 0)

View file

@ -61,7 +61,7 @@ class NextgovBridge extends FeedExpander {
return 'Could not request Nextgov: ' . $url;
$contents = $article->find('div.wysiwyg', 0);
$contents->find('svg.content-tombstone', 0)->remove();
$contents->find('svg.content-tombstone', 0)->outertext = '';
$contents = $contents->innertext;
$contents = stripWithDelimiters($contents, '<div class="ad-container">', '</div>');
$contents = stripWithDelimiters($contents, '<div', '</div>'); //ad outer div

View file

@ -51,7 +51,7 @@ class OsmAndBlogBridge extends BridgeAbstract {
private function cleanupContent($content, ...$removeItems) {
foreach ($removeItems as $obj) {
if ($obj) $obj->remove();
if ($obj) $obj->outertext = '';
}
foreach ($content->find('img') as $obj) {
$obj->src = $this->filterURL($obj->src);

View file

@ -83,7 +83,7 @@ class PikabuBridge extends BridgeAbstract {
foreach($el_to_remove_selectors as $el_to_remove_selector) {
foreach($post->find($el_to_remove_selector) as $el) {
$el->remove();
$el->outertext = '';
}
}

View file

@ -38,17 +38,20 @@ class RadioMelodieBridge extends BridgeAbstract {
$imgs = $textDOM->find('img[src^="http://www.radiomelodie.com/image.php]');
foreach($imgs as $img) {
$img->src = $this->rewriteImage($img->src);
$article->save();
}
// Remove Google Ads
$ads = $article->find('div[class=adInline]');
foreach($ads as $ad) {
$ad->remove();
$ad->outertext = '';
$article->save();
}
// Remove Radio Melodie Logo
$logoHTML = $article->find('div[id=logoArticleRM]', 0);
$logoHTML->remove();
$logoHTML->outertext = '';
$article->save();
$author = $article->find('p[class=AuthorName]', 0)->plaintext;
@ -62,7 +65,8 @@ class RadioMelodieBridge extends BridgeAbstract {
$header = '<img src="' . $picture[0] . '"/>';
// Remove the Date and Author part
$textDOM->find('div[class=AuthorDate]', 0)->remove();
$textDOM->find('div[class=AuthorDate]', 0)->outertext = '';
$article->save();
$text = $textDOM->innertext;
$item['content'] = '<h1>' . $item['title'] . '</h1>' . $date . '<br/>' . $header . $text;
$this->items[] = $item;

View file

@ -48,7 +48,7 @@ class SIMARBridge extends BridgeAbstract {
foreach($e_item->find('p') as $paragraph) {
/* Remove empty paragraphs */
if (preg_match('/^(\W|&nbsp;)+$/', $paragraph->innertext) == 1) {
$paragraph->remove();
$paragraph->outertext = '';
}
}
if ($e_item) {

View file

@ -18,7 +18,7 @@ class ScmbBridge extends BridgeAbstract {
$item['title'] = $article->find('header h1 a', 0)->innertext;
// remove text "En savoir plus" from anecdote content
$article->find('span.read-more', 0)->remove();
$article->find('span.read-more', 0)->outertext = '';
$content = $article->find('p.summary a', 0)->innertext;
// remove superfluous spaces at the end

View file

@ -171,7 +171,7 @@ class TwitterBridge extends BridgeAbstract {
// remove 'invisible' content
foreach($tweet->find('.invisible') as $invisible) {
$invisible->remove();
$invisible->outertext = '';
}
// Skip promoted tweets

View file

@ -62,8 +62,9 @@ class VkBridge extends BridgeAbstract
$this->pageName = htmlspecialchars_decode($pageName);
}
foreach ($html->find('div.replies') as $comment_block) {
$comment_block->remove();
$comment_block->outertext = '';
}
$html->load($html->save());
$pinned_post_item = null;
$last_post_id = 0;
@ -81,7 +82,7 @@ class VkBridge extends BridgeAbstract
if (is_object($post->find('a.wall_post_more', 0))) {
//delete link "show full" in content
$post->find('a.wall_post_more', 0)->remove();
$post->find('a.wall_post_more', 0)->outertext = '';
}
$content_suffix = '';
@ -113,7 +114,7 @@ class VkBridge extends BridgeAbstract
foreach($external_link_selectors_to_remove as $sel) {
if (is_object($post->find($sel, 0))) {
$post->find($sel, 0)->remove();
$post->find($sel, 0)->outertext = '';
}
}
@ -139,7 +140,7 @@ class VkBridge extends BridgeAbstract
$content_suffix .= "<br><img src='" . $matches[1] . "'>";
}
$content_suffix .= "<br>Article: <a href='$article_link'>$article_title ($article_author)</a>";
$article->remove();
$article->outertext = '';
}
// get video on post
@ -149,7 +150,7 @@ class VkBridge extends BridgeAbstract
$video_title = $video->find('div.post_video_title', 0)->plaintext;
$video_link = $video->find('a.lnk', 0)->getAttribute('href');
$this->appendVideo($video_title, $video_link, $content_suffix, $post_videos);
$video->remove();
$video->outertext = '';
$main_video_link = $video_link;
}
@ -160,14 +161,14 @@ class VkBridge extends BridgeAbstract
if (count($temp) > 1) $video_title = $temp[1];
$video_link = $a->getAttribute('href');
if ($video_link != $main_video_link) $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos);
$a->remove();
$a->outertext = '';
}
// get all photos
foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) {
$result = $this->getPhoto($a);
if ($result == null) continue;
$a->remove();
$a->outertext = '';
$content_suffix .= "<br>$result";
}
@ -176,7 +177,7 @@ class VkBridge extends BridgeAbstract
$a = $el->find('.page_album_link', 0);
$album_title = $a->find('.page_album_title_text', 0)->getAttribute('title');
$album_link = $a->getAttribute('href');
$el->remove();
$el->outertext = '';
$content_suffix .= "<br>Album: <a href='$album_link'>$album_title</a>";
}
@ -199,7 +200,7 @@ class VkBridge extends BridgeAbstract
}
$a->remove();
$a->outertext = '';
}
// get other documents
@ -216,7 +217,7 @@ class VkBridge extends BridgeAbstract
}
$div->remove();
$div->outertext = '';
}
// get polls
@ -226,14 +227,14 @@ class VkBridge extends BridgeAbstract
foreach($div->find('div.page_poll_text') as $poll_stat_title) {
$content_suffix .= '<br>- ' . $poll_stat_title->innertext;
}
$div->remove();
$div->outertext = '';
}
// get sign
$post_author = $pageName;
foreach($post->find('a.wall_signed_by') as $a) {
$post_author = $a->innertext;
$a->remove();
$a->outertext = '';
}
if (is_object($post->find('div.copy_quote', 0))) {
@ -242,7 +243,7 @@ class VkBridge extends BridgeAbstract
}
$copy_quote = $post->find('div.copy_quote', 0);
if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) {
$copy_post_header->remove();
$copy_post_header->outertext = '';
}
$copy_quote_content = $copy_quote->innertext;
$copy_quote->outertext = "<br>Reposted: <br>$copy_quote_content";

View file

@ -141,7 +141,7 @@ class WikipediaBridge extends BridgeAbstract {
$anchorFallbackIndex = 0){
// Clean the bottom of the featured article
if ($element->find('div', -1))
$element->find('div', -1)->remove();
$element->find('div', -1)->outertext = '';
// The title and URI of the article can be found in an anchor containing
// the string '...' in most wikis ('full article ...')
@ -202,10 +202,10 @@ class WikipediaBridge extends BridgeAbstract {
// Let's remove a couple of things from the article
$table = $content->find('#toc', 0); // Table of contents
if(!$table === false)
$table->remove();
$table->outertext = '';
foreach($content->find('ol.references') as $reference) // References
$reference->remove();
$reference->outertext = '';
return str_replace('href="/', 'href="' . $this->getURI() . '/', $content->innertext);
}

View file

@ -50,7 +50,7 @@ class WordPressBridge extends FeedExpander {
foreach ($article->find('h1.entry-title') as $title)
if ($title->plaintext == $item['title'])
$title->remove();
$title->outertext = '';
$article_image = $article_html->find('img.wp-post-image', 0);
if(!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) {

View file

@ -44,7 +44,7 @@ class WorldOfTanksBridge extends FeedExpander {
// Remove the scripts, please
foreach($content->find('script') as $script) {
$script->remove();
$script->outertext = '';
}
return $content->innertext;

View file

@ -193,7 +193,7 @@ class XenForoBridge extends BridgeAbstract {
// Remove script tags
foreach($content->find('script') as $script) {
$script->remove();
$script->outertext = '';
}
$item['content'] = $content->innertext;

View file

@ -36,7 +36,7 @@ function sanitize($html,
if(in_array($element->tag, $text_to_keep)) {
$element->outertext = $element->plaintext;
} elseif(in_array($element->tag, $tags_to_remove)) {
$element->remove();
$element->outertext = '';
} else {
foreach($element->getAllAttributes() as $attributeName => $attribute) {
if(!in_array($attributeName, $attributes_to_keep))