[Multi] Minor improvements for my bridges (#1507)

* [DarkReading] Hide dummy articles

* [FuturaSciences] Strip inline scripts from content

* [FeedExpander] Fix PHP notice on missing uri field

Condition changed from (guid is a valid URI AND item URI is not valid)
to (guid is a valid URI AND item URI is empty or not valid)

* [NextInpact] Fix subtitle extraction

* [Markdown] Fix images with empty replacement text

* [TheHackerNews] Fix Author name cleanup

* [LeMondeInformatique] Remove encoding conversion

The conversion was previously needed because the actual encoding of the page
was inconsistent with the encoding specified in its <meta> tag

* [AnimeUltime] Remove encoding conversion

The conversion was previously needed because the encoding on the page was incorrect

* [FuturaSciences] Fix content extraction

* [FuturaSciences] Fix unneeded unset()

* [GBAtemp] Fix tutorial mode URL extraction

* [GBAtemp] Fix tutorial mode Title extraction
This commit is contained in:
Eugene Molotov 2020-08-14 10:30:31 +05:00 committed by GitHub
commit 4b8c3b9d36
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 12 additions and 9 deletions

View File

@ -102,7 +102,6 @@ class AnimeUltimeBridge extends BridgeAbstract {
$item_description = defaultLinkTo($item_description, self::URI);
$item_description = str_replace("\r", '', $item_description);
$item_description = str_replace("\n", '', $item_description);
$item_description = utf8_encode($item_description);
//Build and add final item
$item = array();

View File

@ -53,6 +53,8 @@ class DarkReadingBridge extends FeedExpander {
protected function parseItem($newsItem){
$item = parent::parseItem($newsItem);
if (empty($item['content']))
return null; //ignore dummy articles
$article = getSimpleHTMLDOMCached($item['uri'])
or returnServerError('Could not request Dark Reading: ' . $item['uri']);
$item['content'] = $this->extractArticleContent($article);

View File

@ -96,7 +96,7 @@ class FuturaSciencesBridge extends FeedExpander {
}
private function extractArticleContent($article){
$contents = $article->find('section.article-text-classic', 0)->innertext;
$contents = $article->find('section.article-text', 1)->innertext;
$headline = trim($article->find('p.description', 0)->plaintext);
if(!empty($headline))
$headline = '<p><b>' . $headline . '</b></p>';
@ -129,6 +129,7 @@ class FuturaSciencesBridge extends FeedExpander {
$contents = stripWithDelimiters($contents, 'fs:xt:clickname="', '"');
$contents = StripWithDelimiters($contents, '<section class="module-toretain module-propal-nl', '</section>');
$contents = stripWithDelimiters($contents, '<script ', '</script>');
$contents = stripWithDelimiters($contents, '<script>', '</script>');
return $headline . trim($contents);
}

View File

@ -113,8 +113,8 @@ class GBAtempBridge extends BridgeAbstract {
break;
case 'T':
foreach($html->find('li.portal-tutorial') as $tutorialItem) {
$url = self::URI . $tutorialItem->find('a', 0)->href;
$title = $tutorialItem->find('a', 0)->plaintext;
$url = self::URI . $tutorialItem->find('a', 1)->href;
$title = $tutorialItem->find('a', 1)->plaintext;
$time = $this->findItemDate($tutorialItem);
$author = $tutorialItem->find('a.username', 0)->plaintext;
$content = $this->fetchPostContent($url, self::URI);

View File

@ -26,8 +26,8 @@ class LeMondeInformatiqueBridge extends FeedExpander {
//No response header sets the encoding, explicit conversion is needed or subsequent xml_encode() will fail
$content_node = $article_html->find('div.col-primary, div.col-sm-9', 0);
$item['content'] = utf8_encode($this->cleanArticle($content_node->innertext));
$item['author'] = utf8_encode($article_html->find('div.author-infos', 0)->find('b', 0)->plaintext);
$item['content'] = $this->cleanArticle($content_node->innertext);
$item['author'] = $article_html->find('div.author-infos', 0)->find('b', 0)->plaintext;
return $item;
}

View File

@ -17,6 +17,7 @@ class TheHackerNewsBridge extends BridgeAbstract {
$article_url = $element->find('a.story-link', 0)->href;
$article_author = trim($element->find('i.icon-user', 0)->parent()->plaintext);
$article_author = str_replace('&#59396;', '', $article_author);
$article_title = $element->find('h2.home-title', 0)->plaintext;
//Date without time

View File

@ -346,7 +346,7 @@ abstract class FeedExpander extends BridgeAbstract {
if($attribute === 'isPermaLink'
&& ($value === 'true' || (
filter_var($feedItem->guid, FILTER_VALIDATE_URL)
&& !filter_var($item['uri'], FILTER_VALIDATE_URL)
&& (empty($item['uri']) || !filter_var($item['uri'], FILTER_VALIDATE_URL))
)
)
) {

View File

@ -207,7 +207,7 @@ function markdownToHtml($string) {
//For more details about how these regex work:
// https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702
// Images: https://regex101.com/r/JW9Evr/1
// Images: https://regex101.com/r/JW9Evr/2
// Links: https://regex101.com/r/eRGVe7/1
// Bold: https://regex101.com/r/2p40Y0/1
// Italic: https://regex101.com/r/xJkET9/1
@ -215,7 +215,7 @@ function markdownToHtml($string) {
// Plain URL: https://regex101.com/r/2JHYwb/1
// Site name: https://regex101.com/r/qIuKYE/1
$string = preg_replace('/\!\[([^\]]+)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
$string = preg_replace('/\!\[([^\]]*)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
$string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>', $string);
$string = preg_replace('/\*\*(.*)\*\*/U', '<b>$1</b>', $string);
$string = preg_replace('/\*(.*)\*/U', '<i>$1</i>', $string);