[Multi] Minor improvements for my bridges (#1507)

* [DarkReading] Hide dummy articles

* [FuturaSciences] Strip inline scripts from content

* [FeedExpander] Fix PHP notice on missing uri field

Condition changed from: (guid is a valid URI AND item URI is not valid)
 to: (guid is a valid URI AND item URI is empty or not valid)

* [NextInpact] Fix subtitle extraction

* [Markdown] Fix images with empty replacement text

* [TheHackerNews] Fix Author name cleanup

* [LeMondeInformatique] Remove encoding conversion

The conversion was previously needed because the actual encoding of the page
was inconsistent with the encoding specified in its <meta> tag.

* [AnimeUltime] Remove encoding conversion

The conversion was previously needed because the encoding on the page was incorrect.

* [FuturaSciences] Fix content extraction

* [FuturaSciences] Fix unneeded unset()

* [GBAtemp] Fix tutorial mode URL extraction

* [GBAtemp] Fix tutorial mode Title extraction
This commit is contained in:
Eugene Molotov 2020-08-14 10:30:31 +05:00 committed by GitHub
commit 4b8c3b9d36
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 12 additions and 9 deletions

View file

@ -102,7 +102,6 @@ class AnimeUltimeBridge extends BridgeAbstract {
$item_description = defaultLinkTo($item_description, self::URI); $item_description = defaultLinkTo($item_description, self::URI);
$item_description = str_replace("\r", '', $item_description); $item_description = str_replace("\r", '', $item_description);
$item_description = str_replace("\n", '', $item_description); $item_description = str_replace("\n", '', $item_description);
$item_description = utf8_encode($item_description);
//Build and add final item //Build and add final item
$item = array(); $item = array();

View file

@ -53,6 +53,8 @@ class DarkReadingBridge extends FeedExpander {
protected function parseItem($newsItem){ protected function parseItem($newsItem){
$item = parent::parseItem($newsItem); $item = parent::parseItem($newsItem);
if (empty($item['content']))
return null; //ignore dummy articles
$article = getSimpleHTMLDOMCached($item['uri']) $article = getSimpleHTMLDOMCached($item['uri'])
or returnServerError('Could not request Dark Reading: ' . $item['uri']); or returnServerError('Could not request Dark Reading: ' . $item['uri']);
$item['content'] = $this->extractArticleContent($article); $item['content'] = $this->extractArticleContent($article);

View file

@ -96,7 +96,7 @@ class FuturaSciencesBridge extends FeedExpander {
} }
private function extractArticleContent($article){ private function extractArticleContent($article){
$contents = $article->find('section.article-text-classic', 0)->innertext; $contents = $article->find('section.article-text', 1)->innertext;
$headline = trim($article->find('p.description', 0)->plaintext); $headline = trim($article->find('p.description', 0)->plaintext);
if(!empty($headline)) if(!empty($headline))
$headline = '<p><b>' . $headline . '</b></p>'; $headline = '<p><b>' . $headline . '</b></p>';
@ -129,6 +129,7 @@ class FuturaSciencesBridge extends FeedExpander {
$contents = stripWithDelimiters($contents, 'fs:xt:clickname="', '"'); $contents = stripWithDelimiters($contents, 'fs:xt:clickname="', '"');
$contents = StripWithDelimiters($contents, '<section class="module-toretain module-propal-nl', '</section>'); $contents = StripWithDelimiters($contents, '<section class="module-toretain module-propal-nl', '</section>');
$contents = stripWithDelimiters($contents, '<script ', '</script>'); $contents = stripWithDelimiters($contents, '<script ', '</script>');
$contents = stripWithDelimiters($contents, '<script>', '</script>');
return $headline . trim($contents); return $headline . trim($contents);
} }

View file

@ -113,8 +113,8 @@ class GBAtempBridge extends BridgeAbstract {
break; break;
case 'T': case 'T':
foreach($html->find('li.portal-tutorial') as $tutorialItem) { foreach($html->find('li.portal-tutorial') as $tutorialItem) {
$url = self::URI . $tutorialItem->find('a', 0)->href; $url = self::URI . $tutorialItem->find('a', 1)->href;
$title = $tutorialItem->find('a', 0)->plaintext; $title = $tutorialItem->find('a', 1)->plaintext;
$time = $this->findItemDate($tutorialItem); $time = $this->findItemDate($tutorialItem);
$author = $tutorialItem->find('a.username', 0)->plaintext; $author = $tutorialItem->find('a.username', 0)->plaintext;
$content = $this->fetchPostContent($url, self::URI); $content = $this->fetchPostContent($url, self::URI);

View file

@ -26,8 +26,8 @@ class LeMondeInformatiqueBridge extends FeedExpander {
//No response header sets the encoding, explicit conversion is needed or subsequent xml_encode() will fail //No response header sets the encoding, explicit conversion is needed or subsequent xml_encode() will fail
$content_node = $article_html->find('div.col-primary, div.col-sm-9', 0); $content_node = $article_html->find('div.col-primary, div.col-sm-9', 0);
$item['content'] = utf8_encode($this->cleanArticle($content_node->innertext)); $item['content'] = $this->cleanArticle($content_node->innertext);
$item['author'] = utf8_encode($article_html->find('div.author-infos', 0)->find('b', 0)->plaintext); $item['author'] = $article_html->find('div.author-infos', 0)->find('b', 0)->plaintext;
return $item; return $item;
} }

View file

@ -17,6 +17,7 @@ class TheHackerNewsBridge extends BridgeAbstract {
$article_url = $element->find('a.story-link', 0)->href; $article_url = $element->find('a.story-link', 0)->href;
$article_author = trim($element->find('i.icon-user', 0)->parent()->plaintext); $article_author = trim($element->find('i.icon-user', 0)->parent()->plaintext);
$article_author = str_replace('&#59396;', '', $article_author);
$article_title = $element->find('h2.home-title', 0)->plaintext; $article_title = $element->find('h2.home-title', 0)->plaintext;
//Date without time //Date without time

View file

@ -346,7 +346,7 @@ abstract class FeedExpander extends BridgeAbstract {
if($attribute === 'isPermaLink' if($attribute === 'isPermaLink'
&& ($value === 'true' || ( && ($value === 'true' || (
filter_var($feedItem->guid, FILTER_VALIDATE_URL) filter_var($feedItem->guid, FILTER_VALIDATE_URL)
&& !filter_var($item['uri'], FILTER_VALIDATE_URL) && (empty($item['uri']) || !filter_var($item['uri'], FILTER_VALIDATE_URL))
) )
) )
) { ) {

View file

@ -207,7 +207,7 @@ function markdownToHtml($string) {
//For more details about how these regex work: //For more details about how these regex work:
// https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702 // https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702
// Images: https://regex101.com/r/JW9Evr/1 // Images: https://regex101.com/r/JW9Evr/2
// Links: https://regex101.com/r/eRGVe7/1 // Links: https://regex101.com/r/eRGVe7/1
// Bold: https://regex101.com/r/2p40Y0/1 // Bold: https://regex101.com/r/2p40Y0/1
// Italic: https://regex101.com/r/xJkET9/1 // Italic: https://regex101.com/r/xJkET9/1
@ -215,7 +215,7 @@ function markdownToHtml($string) {
// Plain URL: https://regex101.com/r/2JHYwb/1 // Plain URL: https://regex101.com/r/2JHYwb/1
// Site name: https://regex101.com/r/qIuKYE/1 // Site name: https://regex101.com/r/qIuKYE/1
$string = preg_replace('/\!\[([^\]]+)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string); $string = preg_replace('/\!\[([^\]]*)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
$string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>', $string); $string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>', $string);
$string = preg_replace('/\*\*(.*)\*\*/U', '<b>$1</b>', $string); $string = preg_replace('/\*\*(.*)\*\*/U', '<b>$1</b>', $string);
$string = preg_replace('/\*(.*)\*/U', '<i>$1</i>', $string); $string = preg_replace('/\*(.*)\*/U', '<i>$1</i>', $string);