Revert "all: Use ->remove() instead of ->outertext = ''"
This reverts commit 052844f5e1.
There is a bug in ->remove() that causes the parser to incorrectly
identify elements in the DOM tree that shouldn't exist anymore.
References #1151
This commit is contained in:
parent
468d8be72d
commit
6c4098d655
21 changed files with 49 additions and 44 deletions
|
@ -50,18 +50,18 @@ class AsahiShimbunAJWBridge extends BridgeAbstract {
|
|||
$e_lead = $element->find('span.Lead', 0);
|
||||
if ($e_lead) {
|
||||
$item['content'] = $e_lead->innertext;
|
||||
$e_lead->remove();
|
||||
$e_lead->outertext = '';
|
||||
} else {
|
||||
$item['content'] = $element->innertext;
|
||||
}
|
||||
$e_date = $element->find('span.EnDate', 0);
|
||||
if ($e_date) {
|
||||
$item['timestamp'] = strtotime($e_date->innertext);
|
||||
$e_date->remove();
|
||||
$e_date->outertext = '';
|
||||
}
|
||||
$e_video = $element->find('span.EnVideo', 0);
|
||||
if ($e_video) {
|
||||
$e_video->remove();
|
||||
$e_video->outertext = '';
|
||||
$element->innertext = "VIDEO: $element->innertext";
|
||||
}
|
||||
$item['title'] = $element->innertext;
|
||||
|
|
|
@ -55,7 +55,7 @@ class BundesbankBridge extends BridgeAbstract {
|
|||
$title = $study->find('.teasable__title div.h2', 0);
|
||||
|
||||
foreach($title->children as &$child) {
|
||||
$child->remove();
|
||||
$child->outertext = '';
|
||||
}
|
||||
|
||||
$item['title'] = $title->innertext;
|
||||
|
|
|
@ -58,7 +58,7 @@ class CastorusBridge extends BridgeAbstract {
|
|||
returnServerError('Cannot find nodes!');
|
||||
|
||||
foreach($nodes as $node) {
|
||||
$node->remove();
|
||||
$node->outertext = '';
|
||||
}
|
||||
|
||||
return strtotime($activity->innertext);
|
||||
|
|
|
@ -50,7 +50,7 @@ class DauphineLibereBridge extends FeedExpander {
|
|||
private function extractContent($url){
|
||||
$html2 = getSimpleHTMLDOMCached($url);
|
||||
foreach ($html2->find('.noprint, link, script, iframe, .shareTool, .contentInfo') as $remove) {
|
||||
$remove->remove();
|
||||
$remove->outertext = '';
|
||||
}
|
||||
return $html2->find('div.content', 0)->innertext;
|
||||
}
|
||||
|
|
|
@ -29,16 +29,16 @@ class EconomistBridge extends BridgeAbstract {
|
|||
// Remove newsletter subscription box
|
||||
$newsletter = $content->find('div[class="newsletter-form__message"]', 0);
|
||||
if ($newsletter)
|
||||
$newsletter->remove();
|
||||
$newsletter->outertext = '';
|
||||
|
||||
$newsletterForm = $content->find('form', 0);
|
||||
if ($newsletterForm)
|
||||
$newsletterForm->remove();
|
||||
$newsletterForm->outertext = '';
|
||||
|
||||
// Remove next and previous article URLs at the bottom
|
||||
$nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0);
|
||||
if ($nextprev)
|
||||
$nextprev->remove();
|
||||
$nextprev->outertext = '';
|
||||
|
||||
$section = [ $article->find('h3[itemprop="articleSection"]', 0)->plaintext ];
|
||||
|
||||
|
|
|
@ -584,7 +584,7 @@ EOD;
|
|||
|
||||
foreach($content_filters as $filter) {
|
||||
foreach($content->find($filter) as $subject) {
|
||||
$subject->remove();
|
||||
$subject->outertext = '';
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -50,7 +50,7 @@ class HaveIBeenPwnedBridge extends BridgeAbstract {
|
|||
$permalink = $breach->find('p', 1)->find('a', 0)->href;
|
||||
|
||||
// Remove permalink
|
||||
$breach->find('p', 1)->find('a', 0)->remove();
|
||||
$breach->find('p', 1)->find('a', 0)->outertext = '';
|
||||
|
||||
$item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . ' breached accounts';
|
||||
$item['dateAdded'] = strtotime($dateAdded[1]);
|
||||
|
|
|
@ -239,16 +239,16 @@ class JustETFBridge extends BridgeAbstract {
|
|||
or returnServerError('Article body not found!');
|
||||
|
||||
// Remove teaser image
|
||||
$element->find('img.teaser-img', 0)->remove();
|
||||
$element->find('img.teaser-img', 0)->outertext = '';
|
||||
|
||||
// Remove self advertisements
|
||||
foreach($element->find('.call-action') as $adv) {
|
||||
$adv->remove();
|
||||
$adv->outertext = '';
|
||||
}
|
||||
|
||||
// Remove tips
|
||||
foreach($element->find('.panel-edu') as $tip) {
|
||||
$tip->remove();
|
||||
$tip->outertext = '';
|
||||
}
|
||||
|
||||
// Remove inline scripts (used for i.e. interactive graphs) as they are
|
||||
|
@ -318,7 +318,7 @@ class JustETFBridge extends BridgeAbstract {
|
|||
$description = $description->parent();
|
||||
|
||||
foreach($description->find('div') as $div) {
|
||||
$div->remove();
|
||||
$div->outertext = '';
|
||||
}
|
||||
|
||||
$quote = $html->find('div.infobox div.val', 0)
|
||||
|
|
|
@ -61,7 +61,7 @@ class NextgovBridge extends FeedExpander {
|
|||
return 'Could not request Nextgov: ' . $url;
|
||||
|
||||
$contents = $article->find('div.wysiwyg', 0);
|
||||
$contents->find('svg.content-tombstone', 0)->remove();
|
||||
$contents->find('svg.content-tombstone', 0)->outertext = '';
|
||||
$contents = $contents->innertext;
|
||||
$contents = stripWithDelimiters($contents, '<div class="ad-container">', '</div>');
|
||||
$contents = stripWithDelimiters($contents, '<div', '</div>'); //ad outer div
|
||||
|
|
|
@ -51,7 +51,7 @@ class OsmAndBlogBridge extends BridgeAbstract {
|
|||
|
||||
private function cleanupContent($content, ...$removeItems) {
|
||||
foreach ($removeItems as $obj) {
|
||||
if ($obj) $obj->remove();
|
||||
if ($obj) $obj->outertext = '';
|
||||
}
|
||||
foreach ($content->find('img') as $obj) {
|
||||
$obj->src = $this->filterURL($obj->src);
|
||||
|
|
|
@ -83,7 +83,7 @@ class PikabuBridge extends BridgeAbstract {
|
|||
|
||||
foreach($el_to_remove_selectors as $el_to_remove_selector) {
|
||||
foreach($post->find($el_to_remove_selector) as $el) {
|
||||
$el->remove();
|
||||
$el->outertext = '';
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -38,17 +38,20 @@ class RadioMelodieBridge extends BridgeAbstract {
|
|||
$imgs = $textDOM->find('img[src^="http://www.radiomelodie.com/image.php]');
|
||||
foreach($imgs as $img) {
|
||||
$img->src = $this->rewriteImage($img->src);
|
||||
$article->save();
|
||||
}
|
||||
|
||||
// Remove Google Ads
|
||||
$ads = $article->find('div[class=adInline]');
|
||||
foreach($ads as $ad) {
|
||||
$ad->remove();
|
||||
$ad->outertext = '';
|
||||
$article->save();
|
||||
}
|
||||
|
||||
// Remove Radio Melodie Logo
|
||||
$logoHTML = $article->find('div[id=logoArticleRM]', 0);
|
||||
$logoHTML->remove();
|
||||
$logoHTML->outertext = '';
|
||||
$article->save();
|
||||
|
||||
$author = $article->find('p[class=AuthorName]', 0)->plaintext;
|
||||
|
||||
|
@ -62,7 +65,8 @@ class RadioMelodieBridge extends BridgeAbstract {
|
|||
$header = '<img src="' . $picture[0] . '"/>';
|
||||
|
||||
// Remove the Date and Author part
|
||||
$textDOM->find('div[class=AuthorDate]', 0)->remove();
|
||||
$textDOM->find('div[class=AuthorDate]', 0)->outertext = '';
|
||||
$article->save();
|
||||
$text = $textDOM->innertext;
|
||||
$item['content'] = '<h1>' . $item['title'] . '</h1>' . $date . '<br/>' . $header . $text;
|
||||
$this->items[] = $item;
|
||||
|
|
|
@ -48,7 +48,7 @@ class SIMARBridge extends BridgeAbstract {
|
|||
foreach($e_item->find('p') as $paragraph) {
|
||||
/* Remove empty paragraphs */
|
||||
if (preg_match('/^(\W| )+$/', $paragraph->innertext) == 1) {
|
||||
$paragraph->remove();
|
||||
$paragraph->outertext = '';
|
||||
}
|
||||
}
|
||||
if ($e_item) {
|
||||
|
|
|
@ -18,7 +18,7 @@ class ScmbBridge extends BridgeAbstract {
|
|||
$item['title'] = $article->find('header h1 a', 0)->innertext;
|
||||
|
||||
// remove text "En savoir plus" from anecdote content
|
||||
$article->find('span.read-more', 0)->remove();
|
||||
$article->find('span.read-more', 0)->outertext = '';
|
||||
$content = $article->find('p.summary a', 0)->innertext;
|
||||
|
||||
// remove superfluous spaces at the end
|
||||
|
|
|
@ -171,7 +171,7 @@ class TwitterBridge extends BridgeAbstract {
|
|||
|
||||
// remove 'invisible' content
|
||||
foreach($tweet->find('.invisible') as $invisible) {
|
||||
$invisible->remove();
|
||||
$invisible->outertext = '';
|
||||
}
|
||||
|
||||
// Skip protmoted tweets
|
||||
|
|
|
@ -62,8 +62,9 @@ class VkBridge extends BridgeAbstract
|
|||
$this->pageName = htmlspecialchars_decode($pageName);
|
||||
}
|
||||
foreach ($html->find('div.replies') as $comment_block) {
|
||||
$comment_block->remove();
|
||||
$comment_block->outertext = '';
|
||||
}
|
||||
$html->load($html->save());
|
||||
|
||||
$pinned_post_item = null;
|
||||
$last_post_id = 0;
|
||||
|
@ -81,7 +82,7 @@ class VkBridge extends BridgeAbstract
|
|||
|
||||
if (is_object($post->find('a.wall_post_more', 0))) {
|
||||
//delete link "show full" in content
|
||||
$post->find('a.wall_post_more', 0)->remove();
|
||||
$post->find('a.wall_post_more', 0)->outertext = '';
|
||||
}
|
||||
|
||||
$content_suffix = '';
|
||||
|
@ -113,7 +114,7 @@ class VkBridge extends BridgeAbstract
|
|||
|
||||
foreach($external_link_selectors_to_remove as $sel) {
|
||||
if (is_object($post->find($sel, 0))) {
|
||||
$post->find($sel, 0)->remove();
|
||||
$post->find($sel, 0)->outertext = '';
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -139,7 +140,7 @@ class VkBridge extends BridgeAbstract
|
|||
$content_suffix .= "<br><img src='" . $matches[1] . "'>";
|
||||
}
|
||||
$content_suffix .= "<br>Article: <a href='$article_link'>$article_title ($article_author)</a>";
|
||||
$article->remove();
|
||||
$article->outertext = '';
|
||||
}
|
||||
|
||||
// get video on post
|
||||
|
@ -149,7 +150,7 @@ class VkBridge extends BridgeAbstract
|
|||
$video_title = $video->find('div.post_video_title', 0)->plaintext;
|
||||
$video_link = $video->find('a.lnk', 0)->getAttribute('href');
|
||||
$this->appendVideo($video_title, $video_link, $content_suffix, $post_videos);
|
||||
$video->remove();
|
||||
$video->outertext = '';
|
||||
$main_video_link = $video_link;
|
||||
}
|
||||
|
||||
|
@ -160,14 +161,14 @@ class VkBridge extends BridgeAbstract
|
|||
if (count($temp) > 1) $video_title = $temp[1];
|
||||
$video_link = $a->getAttribute('href');
|
||||
if ($video_link != $main_video_link) $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos);
|
||||
$a->remove();
|
||||
$a->outertext = '';
|
||||
}
|
||||
|
||||
// get all photos
|
||||
foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) {
|
||||
$result = $this->getPhoto($a);
|
||||
if ($result == null) continue;
|
||||
$a->remove();
|
||||
$a->outertext = '';
|
||||
$content_suffix .= "<br>$result";
|
||||
}
|
||||
|
||||
|
@ -176,7 +177,7 @@ class VkBridge extends BridgeAbstract
|
|||
$a = $el->find('.page_album_link', 0);
|
||||
$album_title = $a->find('.page_album_title_text', 0)->getAttribute('title');
|
||||
$album_link = $a->getAttribute('href');
|
||||
$el->remove();
|
||||
$el->outertext = '';
|
||||
$content_suffix .= "<br>Album: <a href='$album_link'>$album_title</a>";
|
||||
}
|
||||
|
||||
|
@ -199,7 +200,7 @@ class VkBridge extends BridgeAbstract
|
|||
|
||||
}
|
||||
|
||||
$a->remove();
|
||||
$a->outertext = '';
|
||||
}
|
||||
|
||||
// get other documents
|
||||
|
@ -216,7 +217,7 @@ class VkBridge extends BridgeAbstract
|
|||
|
||||
}
|
||||
|
||||
$div->remove();
|
||||
$div->outertext = '';
|
||||
}
|
||||
|
||||
// get polls
|
||||
|
@ -226,14 +227,14 @@ class VkBridge extends BridgeAbstract
|
|||
foreach($div->find('div.page_poll_text') as $poll_stat_title) {
|
||||
$content_suffix .= '<br>- ' . $poll_stat_title->innertext;
|
||||
}
|
||||
$div->remove();
|
||||
$div->outertext = '';
|
||||
}
|
||||
|
||||
// get sign
|
||||
$post_author = $pageName;
|
||||
foreach($post->find('a.wall_signed_by') as $a) {
|
||||
$post_author = $a->innertext;
|
||||
$a->remove();
|
||||
$a->outertext = '';
|
||||
}
|
||||
|
||||
if (is_object($post->find('div.copy_quote', 0))) {
|
||||
|
@ -242,7 +243,7 @@ class VkBridge extends BridgeAbstract
|
|||
}
|
||||
$copy_quote = $post->find('div.copy_quote', 0);
|
||||
if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) {
|
||||
$copy_post_header->remove();
|
||||
$copy_post_header->outertext = '';
|
||||
}
|
||||
$copy_quote_content = $copy_quote->innertext;
|
||||
$copy_quote->outertext = "<br>Reposted: <br>$copy_quote_content";
|
||||
|
|
|
@ -141,7 +141,7 @@ class WikipediaBridge extends BridgeAbstract {
|
|||
$anchorFallbackIndex = 0){
|
||||
// Clean the bottom of the featured article
|
||||
if ($element->find('div', -1))
|
||||
$element->find('div', -1)->remove();
|
||||
$element->find('div', -1)->outertext = '';
|
||||
|
||||
// The title and URI of the article can be found in an anchor containing
|
||||
// the string '...' in most wikis ('full article ...')
|
||||
|
@ -202,10 +202,10 @@ class WikipediaBridge extends BridgeAbstract {
|
|||
// Let's remove a couple of things from the article
|
||||
$table = $content->find('#toc', 0); // Table of contents
|
||||
if(!$table === false)
|
||||
$table->remove();
|
||||
$table->outertext = '';
|
||||
|
||||
foreach($content->find('ol.references') as $reference) // References
|
||||
$reference->remove();
|
||||
$reference->outertext = '';
|
||||
|
||||
return str_replace('href="/', 'href="' . $this->getURI() . '/', $content->innertext);
|
||||
}
|
||||
|
|
|
@ -50,7 +50,7 @@ class WordPressBridge extends FeedExpander {
|
|||
|
||||
foreach ($article->find('h1.entry-title') as $title)
|
||||
if ($title->plaintext == $item['title'])
|
||||
$title->remove();
|
||||
$title->outertext = '';
|
||||
|
||||
$article_image = $article_html->find('img.wp-post-image', 0);
|
||||
if(!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) {
|
||||
|
|
|
@ -44,7 +44,7 @@ class WorldOfTanksBridge extends FeedExpander {
|
|||
|
||||
// Remove the scripts, please
|
||||
foreach($content->find('script') as $script) {
|
||||
$script->remove();
|
||||
$script->outertext = '';
|
||||
}
|
||||
|
||||
return $content->innertext;
|
||||
|
|
|
@ -193,7 +193,7 @@ class XenForoBridge extends BridgeAbstract {
|
|||
|
||||
// Remove script tags
|
||||
foreach($content->find('script') as $script) {
|
||||
$script->remove();
|
||||
$script->outertext = '';
|
||||
}
|
||||
|
||||
$item['content'] = $content->innertext;
|
||||
|
|
|
@ -36,7 +36,7 @@ function sanitize($html,
|
|||
if(in_array($element->tag, $text_to_keep)) {
|
||||
$element->outertext = $element->plaintext;
|
||||
} elseif(in_array($element->tag, $tags_to_remove)) {
|
||||
$element->remove();
|
||||
$element->outertext = '';
|
||||
} else {
|
||||
foreach($element->getAllAttributes() as $attributeName => $attribute) {
|
||||
if(!in_array($attributeName, $attributes_to_keep))
|
||||
|
|
Loading…
Reference in a new issue