[GuardianBridge] - New bridge for the Guardian (#1249)

* [GuardianBridge] - New bridge for the Guardian
2019-08-28 19:57:45 +05:30 · 2019-08-28 19:57:45 +05:30 · f27b267614
commit f27b267614
parent 8bff63d9c6
1 changed files with 96 additions and 0 deletions
--- a/bridges/TheGuardianBridge.php
+++ b/bridges/TheGuardianBridge.php
@ -0,0 +1,96 @@
+<?php
+class TheGuardianBridge extends FeedExpander {
+	const MAINTAINER = 'IceWreck';
+	const NAME = 'The Guardian Bridge';
+	const URI = 'https://www.theguardian.com/';
+	const CACHE_TIMEOUT = 600; // This is a news site, so don't cache for more than 10 mins
+	const DESCRIPTION = 'RSS feed for The Guardian';
+	/*
+	 * Topic feeds: appending /rss to a topic URL gives its base feed, e.g.
+	 * https://feeds.theguardian.com/theguardian/uk-news/rss or simply
+	 * https://www.theguardian.com/world/rss. To support another topic, add its
+	 * path as a value below; collectData() strips any leading slash.
+	 */
+	const PARAMETERS = array( array(
+		'feed' => array(
+			'name' => 'Feed',
+			'type' => 'list',
+			'values' => array(
+				'World News' => 'world/rss',
+				'US News' => '/us-news/rss',
+				'UK News' => '/uk-news/rss',
+				'Europe News' => '/world/europe-news/rss',
+				'Asia News' => '/world/asia/rss',
+				'Tech' => '/uk/technology/rss',
+				'Business News' => '/uk/business/rss',
+				'Opinion' => '/uk/commentisfree/rss',
+				'Lifestyle' => '/uk/lifeandstyle/rss',
+				'Culture' => '/uk/culture/rss',
+				'Sports' => '/uk/sport/rss'
+			)
+		)
+	));
+
+	/** Hand the URL of the selected Guardian feed to collectExpandableDatas(). */
+	public function collectData(){
+		// Most list values start with '/', which would otherwise produce a
+		// double slash after the base path ('.../theguardian//us-news/rss').
+		$feed = ltrim((string)$this->getInput('feed'), '/');
+		$feedURL = 'https://feeds.theguardian.com/theguardian/' . $feed;
+		$this->collectExpandableDatas($feedURL, 10);
+	}
+
+	/**
+	 * Replace the feed item's content with the article body scraped from its page.
+	 *
+	 * @param object $newsItem Feed entry; only its `link` member is read here.
+	 * @return array Item from parent::parseItem(); 'content' is overwritten only
+	 *               when something could be extracted from the article page.
+	 */
+	protected function parseItem($newsItem){
+		$item = parent::parseItem($newsItem);
+
+		// getSimpleHTMLDOM() is defined outside this file; if it yields nothing
+		// usable, keep the item as parsed instead of calling find() on a non-object.
+		$articlePage = getSimpleHTMLDOM($newsItem->link);
+		if(!$articlePage) {
+			return $item;
+		}
+
+		// The first <figure> holds the main article image (may be absent), and
+		// .content__article-body holds the actual article text.
+		$article = (string)$articlePage->find('figure', 0);
+		foreach($articlePage->find('.content__article-body') as $element) {
+			$article .= $element;
+		}
+
+		// Replace each image-viewer link with the bare image it wraps; when no
+		// <img> is found the link is dropped (empty replacement, never null).
+		foreach($articlePage->find('a.article__img-container') as $container) {
+			$img = (string)$container->find('img', 0);
+			$article = str_replace((string)$container, $img, $article);
+		}
+
+		// Elements that only add clutter to the extracted article
+		$uselessElements = array(
+			'#show-caption',
+			'.element-atom',
+			'.submeta',
+			'youtube-media-atom',
+			'svg'
+		);
+
+		foreach($uselessElements as $selector) {
+			foreach($articlePage->find($selector) as $node) {
+				$article = str_replace((string)$node, '', $article);
+			}
+		}
+
+		// Do not wipe the content set by the parent when nothing was extracted
+		if(trim($article) !== '') {
+			$item['content'] = $article;
+		}
+
+		return $item;
+	}
+}