[Multi] Minor improvements for my bridges (#1507)

* [DarkReading] Hide dummy articles
* [FuturaSciences] Strip inline scripts from content
* [FeedExpander] Fix PHP notice on missing uri field
  (guid is valid uri AND item uri is not valid) => (guid is valid uri AND item uri is empty or not valid)
* [NextInpact] Fix subtitle extraction
* [Markdown] Fix images with empty replacement text
* [TheHackerNews] Fix Author name cleanup
* [LeMondeInformatique] Remove encoding conversion
  Was previously needed due to actual encoding on the page being inconsistent with encoding specified in <meta> tag
* [AnimeUltime] Remove encoding conversion
  Was previously needed due to encoding on the page being incorrect
* [FuturaSciences] Fix content extraction
* [FuturaSciences] Fix unneeded unset()
* [GBAtemp] Fix tutorial mode URL extraction
* [GBAtemp] Fix tutorial mode Title extraction
2020-08-14 10:30:31 +05:00 · 2020-08-14 10:30:31 +05:00 · 4b8c3b9d36
commit 4b8c3b9d36
parent dc36b425cd c642652fea
8 changed files with 12 additions and 9 deletions
--- a/bridges/AnimeUltimeBridge.php
+++ b/bridges/AnimeUltimeBridge.php
@ -102,7 +102,6 @@ class AnimeUltimeBridge extends BridgeAbstract {
 							$item_description = defaultLinkTo($item_description, self::URI);
 							$item_description = str_replace("\r", '', $item_description);
 							$item_description = str_replace("\n", '', $item_description);
-							$item_description = utf8_encode($item_description);

 							//Build and add final item
 							$item = array();
--- a/bridges/DarkReadingBridge.php
+++ b/bridges/DarkReadingBridge.php
@ -53,6 +53,8 @@ class DarkReadingBridge extends FeedExpander {

 	protected function parseItem($newsItem){
 		$item = parent::parseItem($newsItem);
+		if (empty($item['content']))
+			return null; //ignore dummy articles
 		$article = getSimpleHTMLDOMCached($item['uri'])
 			or returnServerError('Could not request Dark Reading: ' . $item['uri']);
 		$item['content'] = $this->extractArticleContent($article);
--- a/bridges/FuturaSciencesBridge.php
+++ b/bridges/FuturaSciencesBridge.php
@ -96,7 +96,7 @@ class FuturaSciencesBridge extends FeedExpander {
 	}

 	private function extractArticleContent($article){
-		$contents = $article->find('section.article-text-classic', 0)->innertext;
+		$contents = $article->find('section.article-text', 1)->innertext;
 		$headline = trim($article->find('p.description', 0)->plaintext);
 		if(!empty($headline))
 			$headline = '<p><b>' . $headline . '</b></p>';
@ -129,6 +129,7 @@ class FuturaSciencesBridge extends FeedExpander {
 		$contents = stripWithDelimiters($contents, 'fs:xt:clickname="', '"');
 		$contents = StripWithDelimiters($contents, '<section class="module-toretain module-propal-nl', '</section>');
 		$contents = stripWithDelimiters($contents, '<script ', '</script>');
+		$contents = stripWithDelimiters($contents, '<script>', '</script>');

 		return $headline . trim($contents);
 	}
--- a/bridges/GBAtempBridge.php
+++ b/bridges/GBAtempBridge.php
@ -113,8 +113,8 @@ class GBAtempBridge extends BridgeAbstract {
 			break;
 		case 'T':
 			foreach($html->find('li.portal-tutorial') as $tutorialItem) {
-				$url = self::URI . $tutorialItem->find('a', 0)->href;
-				$title = $tutorialItem->find('a', 0)->plaintext;
+				$url = self::URI . $tutorialItem->find('a', 1)->href;
+				$title = $tutorialItem->find('a', 1)->plaintext;
 				$time = $this->findItemDate($tutorialItem);
 				$author = $tutorialItem->find('a.username', 0)->plaintext;
 				$content = $this->fetchPostContent($url, self::URI);
--- a/bridges/LeMondeInformatiqueBridge.php
+++ b/bridges/LeMondeInformatiqueBridge.php
@ -26,8 +26,8 @@ class LeMondeInformatiqueBridge extends FeedExpander {

 		//No response header sets the encoding, explicit conversion is needed or subsequent xml_encode() will fail
 		$content_node = $article_html->find('div.col-primary, div.col-sm-9', 0);
-		$item['content'] = utf8_encode($this->cleanArticle($content_node->innertext));
-		$item['author'] = utf8_encode($article_html->find('div.author-infos', 0)->find('b', 0)->plaintext);
+		$item['content'] = $this->cleanArticle($content_node->innertext);
+		$item['author'] = $article_html->find('div.author-infos', 0)->find('b', 0)->plaintext;

 		return $item;
 	}
--- a/bridges/TheHackerNewsBridge.php
+++ b/bridges/TheHackerNewsBridge.php
@ -17,6 +17,7 @@ class TheHackerNewsBridge extends BridgeAbstract {

 				$article_url = $element->find('a.story-link', 0)->href;
 				$article_author = trim($element->find('i.icon-user', 0)->parent()->plaintext);
+				$article_author = str_replace('&#59396;', '', $article_author);
 				$article_title = $element->find('h2.home-title', 0)->plaintext;

 				//Date without time
--- a/lib/FeedExpander.php
+++ b/lib/FeedExpander.php
@ -346,7 +346,7 @@ abstract class FeedExpander extends BridgeAbstract {
 				if($attribute === 'isPermaLink'
 					&& ($value === 'true' || (
 							filter_var($feedItem->guid, FILTER_VALIDATE_URL)
-							&& !filter_var($item['uri'], FILTER_VALIDATE_URL)
+							&& (empty($item['uri']) || !filter_var($item['uri'], FILTER_VALIDATE_URL))
 						)
 					)
 				) {
--- a/lib/html.php
+++ b/lib/html.php
@ -207,7 +207,7 @@ function markdownToHtml($string) {

 	//For more details about how these regex work:
 	// https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702
-	// Images: https://regex101.com/r/JW9Evr/1
+	// Images: https://regex101.com/r/JW9Evr/2
 	// Links: https://regex101.com/r/eRGVe7/1
 	// Bold: https://regex101.com/r/2p40Y0/1
 	// Italic: https://regex101.com/r/xJkET9/1
@ -215,7 +215,7 @@ function markdownToHtml($string) {
 	// Plain URL: https://regex101.com/r/2JHYwb/1
 	// Site name: https://regex101.com/r/qIuKYE/1

-	$string = preg_replace('/\!\[([^\]]+)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
+	$string = preg_replace('/\!\[([^\]]*)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
 	$string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>', $string);
 	$string = preg_replace('/\*\*(.*)\*\*/U', '<b>$1</b>', $string);
 	$string = preg_replace('/\*(.*)\*/U', '<i>$1</i>', $string);