From 6c4098d6558c33a5fcb2a8bc9fb29e915d56fc6c Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 2 Jun 2019 13:03:26 +0200 Subject: [PATCH] Revert "all: Use ->remove() instead of ->outertext = ''" This reverts commit 052844f5e13c71ceefd743136a71f71226a0eefb. There is a bug in ->remove() that causes the parser to incorrectly identify elements in the DOM tree that shouldn't exist anymore. References #1151 --- bridges/AsahiShimbunAJWBridge.php | 6 +++--- bridges/BundesbankBridge.php | 2 +- bridges/CastorusBridge.php | 2 +- bridges/DauphineLibereBridge.php | 2 +- bridges/EconomistBridge.php | 6 +++--- bridges/FacebookBridge.php | 2 +- bridges/HaveIBeenPwnedBridge.php | 2 +- bridges/JustETFBridge.php | 8 ++++---- bridges/NextgovBridge.php | 2 +- bridges/OsmAndBlogBridge.php | 2 +- bridges/PikabuBridge.php | 2 +- bridges/RadioMelodieBridge.php | 10 +++++++--- bridges/SIMARBridge.php | 2 +- bridges/ScmbBridge.php | 2 +- bridges/TwitterBridge.php | 2 +- bridges/VkBridge.php | 27 ++++++++++++++------------- bridges/WikipediaBridge.php | 6 +++--- bridges/WordPressBridge.php | 2 +- bridges/WorldOfTanksBridge.php | 2 +- bridges/XenForoBridge.php | 2 +- lib/html.php | 2 +- 21 files changed, 49 insertions(+), 44 deletions(-) diff --git a/bridges/AsahiShimbunAJWBridge.php b/bridges/AsahiShimbunAJWBridge.php index 62b9739d..0ceb0381 100644 --- a/bridges/AsahiShimbunAJWBridge.php +++ b/bridges/AsahiShimbunAJWBridge.php @@ -50,18 +50,18 @@ class AsahiShimbunAJWBridge extends BridgeAbstract { $e_lead = $element->find('span.Lead', 0); if ($e_lead) { $item['content'] = $e_lead->innertext; - $e_lead->remove(); + $e_lead->outertext = ''; } else { $item['content'] = $element->innertext; } $e_date = $element->find('span.EnDate', 0); if ($e_date) { $item['timestamp'] = strtotime($e_date->innertext); - $e_date->remove(); + $e_date->outertext = ''; } $e_video = $element->find('span.EnVideo', 0); if ($e_video) { - $e_video->remove(); + $e_video->outertext = ''; $element->innertext = "VIDEO: $element->innertext"; } $item['title'] = $element->innertext; diff --git a/bridges/BundesbankBridge.php b/bridges/BundesbankBridge.php index d78873c6..b64a6425 100644 --- a/bridges/BundesbankBridge.php +++ b/bridges/BundesbankBridge.php @@ -55,7 +55,7 @@ class BundesbankBridge extends BridgeAbstract { $title = $study->find('.teasable__title div.h2', 0); foreach($title->children as &$child) { - $child->remove(); + $child->outertext = ''; } $item['title'] = $title->innertext; diff --git a/bridges/CastorusBridge.php b/bridges/CastorusBridge.php index 48af9696..3ed1331e 100644 --- a/bridges/CastorusBridge.php +++ b/bridges/CastorusBridge.php @@ -58,7 +58,7 @@ class CastorusBridge extends BridgeAbstract { returnServerError('Cannot find nodes!'); foreach($nodes as $node) { - $node->remove(); + $node->outertext = ''; } return strtotime($activity->innertext); diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index 1ff25106..20c82070 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -50,7 +50,7 @@ class DauphineLibereBridge extends FeedExpander { private function extractContent($url){ $html2 = getSimpleHTMLDOMCached($url); foreach ($html2->find('.noprint, link, script, iframe, .shareTool, .contentInfo') as $remove) { - $remove->remove(); + $remove->outertext = ''; } return $html2->find('div.content', 0)->innertext; } diff --git a/bridges/EconomistBridge.php b/bridges/EconomistBridge.php index 19b2a832..1256be45 100644 --- a/bridges/EconomistBridge.php +++ b/bridges/EconomistBridge.php @@ -29,16 +29,16 @@ class EconomistBridge extends BridgeAbstract { // Remove newsletter subscription box $newsletter = $content->find('div[class="newsletter-form__message"]', 0); if ($newsletter) - $newsletter->remove(); + $newsletter->outertext = ''; $newsletterForm = $content->find('form', 0); if ($newsletterForm) - $newsletterForm->remove(); + $newsletterForm->outertext = ''; // Remove next and previous article URLs at the bottom $nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0); if ($nextprev) - $nextprev->remove(); + $nextprev->outertext = ''; $section = [ $article->find('h3[itemprop="articleSection"]', 0)->plaintext ]; diff --git a/bridges/FacebookBridge.php b/bridges/FacebookBridge.php index a0331da9..c0901072 100644 --- a/bridges/FacebookBridge.php +++ b/bridges/FacebookBridge.php @@ -584,7 +584,7 @@ EOD; foreach($content_filters as $filter) { foreach($content->find($filter) as $subject) { - $subject->remove(); + $subject->outertext = ''; } } diff --git a/bridges/HaveIBeenPwnedBridge.php b/bridges/HaveIBeenPwnedBridge.php index 8fac1e33..f256623a 100644 --- a/bridges/HaveIBeenPwnedBridge.php +++ b/bridges/HaveIBeenPwnedBridge.php @@ -50,7 +50,7 @@ class HaveIBeenPwnedBridge extends BridgeAbstract { $permalink = $breach->find('p', 1)->find('a', 0)->href; // Remove permalink - $breach->find('p', 1)->find('a', 0)->remove(); + $breach->find('p', 1)->find('a', 0)->outertext = ''; $item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . ' breached accounts'; $item['dateAdded'] = strtotime($dateAdded[1]); diff --git a/bridges/JustETFBridge.php b/bridges/JustETFBridge.php index c9201e4b..8d5b3d5a 100644 --- a/bridges/JustETFBridge.php +++ b/bridges/JustETFBridge.php @@ -239,16 +239,16 @@ class JustETFBridge extends BridgeAbstract { or returnServerError('Article body not found!'); // Remove teaser image - $element->find('img.teaser-img', 0)->remove(); + $element->find('img.teaser-img', 0)->outertext = ''; // Remove self advertisements foreach($element->find('.call-action') as $adv) { - $adv->remove(); + $adv->outertext = ''; } // Remove tips foreach($element->find('.panel-edu') as $tip) { - $tip->remove(); + $tip->outertext = ''; } // Remove inline scripts (used for i.e. interactive graphs) as they are @@ -318,7 +318,7 @@ class JustETFBridge extends BridgeAbstract { $description = $description->parent(); foreach($description->find('div') as $div) { - $div->remove(); + $div->outertext = ''; } $quote = $html->find('div.infobox div.val', 0) diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index 5e393457..74bfc54a 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -61,7 +61,7 @@ class NextgovBridge extends FeedExpander { return 'Could not request Nextgov: ' . $url; $contents = $article->find('div.wysiwyg', 0); - $contents->find('svg.content-tombstone', 0)->remove(); + $contents->find('svg.content-tombstone', 0)->outertext = ''; $contents = $contents->innertext; $contents = stripWithDelimiters($contents, '
', '
'); $contents = stripWithDelimiters($contents, ''); //ad outer div diff --git a/bridges/OsmAndBlogBridge.php b/bridges/OsmAndBlogBridge.php index 25e765f5..402c0301 100644 --- a/bridges/OsmAndBlogBridge.php +++ b/bridges/OsmAndBlogBridge.php @@ -51,7 +51,7 @@ class OsmAndBlogBridge extends BridgeAbstract { private function cleanupContent($content, ...$removeItems) { foreach ($removeItems as $obj) { - if ($obj) $obj->remove(); + if ($obj) $obj->outertext = ''; } foreach ($content->find('img') as $obj) { $obj->src = $this->filterURL($obj->src); diff --git a/bridges/PikabuBridge.php b/bridges/PikabuBridge.php index 8573e6b6..362b87dc 100644 --- a/bridges/PikabuBridge.php +++ b/bridges/PikabuBridge.php @@ -83,7 +83,7 @@ class PikabuBridge extends BridgeAbstract { foreach($el_to_remove_selectors as $el_to_remove_selector) { foreach($post->find($el_to_remove_selector) as $el) { - $el->remove(); + $el->outertext = ''; } } diff --git a/bridges/RadioMelodieBridge.php b/bridges/RadioMelodieBridge.php index 8e2cf05d..fb5aca6e 100644 --- a/bridges/RadioMelodieBridge.php +++ b/bridges/RadioMelodieBridge.php @@ -38,17 +38,20 @@ class RadioMelodieBridge extends BridgeAbstract { $imgs = $textDOM->find('img[src^="http://www.radiomelodie.com/image.php]'); foreach($imgs as $img) { $img->src = $this->rewriteImage($img->src); + $article->save(); } // Remove Google Ads $ads = $article->find('div[class=adInline]'); foreach($ads as $ad) { - $ad->remove(); + $ad->outertext = ''; + $article->save(); } // Remove Radio Melodie Logo $logoHTML = $article->find('div[id=logoArticleRM]', 0); - $logoHTML->remove(); + $logoHTML->outertext = ''; + $article->save(); $author = $article->find('p[class=AuthorName]', 0)->plaintext; @@ -62,7 +65,8 @@ class RadioMelodieBridge extends BridgeAbstract { $header = ''; // Remove the Date and Author part - $textDOM->find('div[class=AuthorDate]', 0)->remove(); + $textDOM->find('div[class=AuthorDate]', 0)->outertext = ''; + $article->save(); $text = $textDOM->innertext; $item['content'] = '

' . $item['title'] . '

' . $date . '
' . $header . $text; $this->items[] = $item; diff --git a/bridges/SIMARBridge.php b/bridges/SIMARBridge.php index 41d517b4..1e446cf5 100644 --- a/bridges/SIMARBridge.php +++ b/bridges/SIMARBridge.php @@ -48,7 +48,7 @@ class SIMARBridge extends BridgeAbstract { foreach($e_item->find('p') as $paragraph) { /* Remove empty paragraphs */ if (preg_match('/^(\W| )+$/', $paragraph->innertext) == 1) { - $paragraph->remove(); + $paragraph->outertext = ''; } } if ($e_item) { diff --git a/bridges/ScmbBridge.php b/bridges/ScmbBridge.php index 65fbbf01..2107aa3d 100644 --- a/bridges/ScmbBridge.php +++ b/bridges/ScmbBridge.php @@ -18,7 +18,7 @@ class ScmbBridge extends BridgeAbstract { $item['title'] = $article->find('header h1 a', 0)->innertext; // remove text "En savoir plus" from anecdote content - $article->find('span.read-more', 0)->remove(); + $article->find('span.read-more', 0)->outertext = ''; $content = $article->find('p.summary a', 0)->innertext; // remove superfluous spaces at the end diff --git a/bridges/TwitterBridge.php b/bridges/TwitterBridge.php index f3ba39c1..b3b7bed4 100644 --- a/bridges/TwitterBridge.php +++ b/bridges/TwitterBridge.php @@ -171,7 +171,7 @@ class TwitterBridge extends BridgeAbstract { // remove 'invisible' content foreach($tweet->find('.invisible') as $invisible) { - $invisible->remove(); + $invisible->outertext = ''; } // Skip protmoted tweets diff --git a/bridges/VkBridge.php b/bridges/VkBridge.php index 5274180f..8653e7c9 100644 --- a/bridges/VkBridge.php +++ b/bridges/VkBridge.php @@ -62,8 +62,9 @@ class VkBridge extends BridgeAbstract $this->pageName = htmlspecialchars_decode($pageName); } foreach ($html->find('div.replies') as $comment_block) { - $comment_block->remove(); + $comment_block->outertext = ''; } + $html->load($html->save()); $pinned_post_item = null; $last_post_id = 0; @@ -81,7 +82,7 @@ class VkBridge extends BridgeAbstract if (is_object($post->find('a.wall_post_more', 0))) { //delete link "show full" in content - $post->find('a.wall_post_more', 0)->remove(); + $post->find('a.wall_post_more', 0)->outertext = ''; } $content_suffix = ''; @@ -113,7 +114,7 @@ class VkBridge extends BridgeAbstract foreach($external_link_selectors_to_remove as $sel) { if (is_object($post->find($sel, 0))) { - $post->find($sel, 0)->remove(); + $post->find($sel, 0)->outertext = ''; } } @@ -139,7 +140,7 @@ class VkBridge extends BridgeAbstract $content_suffix .= "
"; } $content_suffix .= "
Article: $article_title ($article_author)"; - $article->remove(); + $article->outertext = ''; } // get video on post @@ -149,7 +150,7 @@ class VkBridge extends BridgeAbstract $video_title = $video->find('div.post_video_title', 0)->plaintext; $video_link = $video->find('a.lnk', 0)->getAttribute('href'); $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); - $video->remove(); + $video->outertext = ''; $main_video_link = $video_link; } @@ -160,14 +161,14 @@ class VkBridge extends BridgeAbstract if (count($temp) > 1) $video_title = $temp[1]; $video_link = $a->getAttribute('href'); if ($video_link != $main_video_link) $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); - $a->remove(); + $a->outertext = ''; } // get all photos foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) { $result = $this->getPhoto($a); if ($result == null) continue; - $a->remove(); + $a->outertext = ''; $content_suffix .= "
$result"; } @@ -176,7 +177,7 @@ class VkBridge extends BridgeAbstract $a = $el->find('.page_album_link', 0); $album_title = $a->find('.page_album_title_text', 0)->getAttribute('title'); $album_link = $a->getAttribute('href'); - $el->remove(); + $el->outertext = ''; $content_suffix .= "
Album: $album_title"; } @@ -199,7 +200,7 @@ class VkBridge extends BridgeAbstract } - $a->remove(); + $a->outertext = ''; } // get other documents @@ -216,7 +217,7 @@ class VkBridge extends BridgeAbstract } - $div->remove(); + $div->outertext = ''; } // get polls @@ -226,14 +227,14 @@ class VkBridge extends BridgeAbstract foreach($div->find('div.page_poll_text') as $poll_stat_title) { $content_suffix .= '
- ' . $poll_stat_title->innertext; } - $div->remove(); + $div->outertext = ''; } // get sign $post_author = $pageName; foreach($post->find('a.wall_signed_by') as $a) { $post_author = $a->innertext; - $a->remove(); + $a->outertext = ''; } if (is_object($post->find('div.copy_quote', 0))) { @@ -242,7 +243,7 @@ class VkBridge extends BridgeAbstract } $copy_quote = $post->find('div.copy_quote', 0); if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) { - $copy_post_header->remove(); + $copy_post_header->outertext = ''; } $copy_quote_content = $copy_quote->innertext; $copy_quote->outertext = "
Reposted:
$copy_quote_content"; diff --git a/bridges/WikipediaBridge.php b/bridges/WikipediaBridge.php index a53652dd..7ca763fc 100644 --- a/bridges/WikipediaBridge.php +++ b/bridges/WikipediaBridge.php @@ -141,7 +141,7 @@ class WikipediaBridge extends BridgeAbstract { $anchorFallbackIndex = 0){ // Clean the bottom of the featured article if ($element->find('div', -1)) - $element->find('div', -1)->remove(); + $element->find('div', -1)->outertext = ''; // The title and URI of the article can be found in an anchor containing // the string '...' in most wikis ('full article ...') @@ -202,10 +202,10 @@ class WikipediaBridge extends BridgeAbstract { // Let's remove a couple of things from the article $table = $content->find('#toc', 0); // Table of contents if(!$table === false) - $table->remove(); + $table->outertext = ''; foreach($content->find('ol.references') as $reference) // References - $reference->remove(); + $reference->outertext = ''; return str_replace('href="/', 'href="' . $this->getURI() . '/', $content->innertext); } diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 18045559..1589c723 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -50,7 +50,7 @@ class WordPressBridge extends FeedExpander { foreach ($article->find('h1.entry-title') as $title) if ($title->plaintext == $item['title']) - $title->remove(); + $title->outertext = ''; $article_image = $article_html->find('img.wp-post-image', 0); if(!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) { diff --git a/bridges/WorldOfTanksBridge.php b/bridges/WorldOfTanksBridge.php index a5fa0446..46dd588d 100644 --- a/bridges/WorldOfTanksBridge.php +++ b/bridges/WorldOfTanksBridge.php @@ -44,7 +44,7 @@ class WorldOfTanksBridge extends FeedExpander { // Remove the scripts, please foreach($content->find('script') as $script) { - $script->remove(); + $script->outertext = ''; } return $content->innertext; diff --git a/bridges/XenForoBridge.php b/bridges/XenForoBridge.php index dc3a1a5e..7bf1f15d 100644 --- a/bridges/XenForoBridge.php +++ b/bridges/XenForoBridge.php @@ -193,7 +193,7 @@ class XenForoBridge extends BridgeAbstract { // Remove script tags foreach($content->find('script') as $script) { - $script->remove(); + $script->outertext = ''; } $item['content'] = $content->innertext; diff --git a/lib/html.php b/lib/html.php index 49c77f04..13db97a4 100644 --- a/lib/html.php +++ b/lib/html.php @@ -36,7 +36,7 @@ function sanitize($html, if(in_array($element->tag, $text_to_keep)) { $element->outertext = $element->plaintext; } elseif(in_array($element->tag, $tags_to_remove)) { - $element->remove(); + $element->outertext = ''; } else { foreach($element->getAllAttributes() as $attributeName => $attribute) { if(!in_array($attributeName, $attributes_to_keep))