[Multi] Minor improvements for my bridges (#1507)
* [DarkReading] Hide dummy articles
* [FuturaSciences] Strip inline scripts from content
* [FeedExpander] Fix PHP notice on missing uri field
  (guid is valid uri AND item uri is not valid) => (guid is valid uri AND item uri is empty or not valid)
* [NextInpact] Fix subtitle extraction
* [Markdown] Fix images with empty replacement text
* [TheHackerNews] Fix Author name cleanup
* [LeMondeInformatique] Remove encoding conversion
  Was previously needed due to actual encoding on the page being inconsistent with encoding specified in <meta> tag
* [AnimeUltime] Remove encoding conversion
  Was previously needed due to encoding on the page being incorrect
* [FuturaSciences] Fix content extraction
* [FuturaSciences] Fix unneeded unset()
* [GBAtemp] Fix tutorial mode URL extraction
* [GBAtemp] Fix tutorial mode Title extraction
This commit is contained in:
commit
4b8c3b9d36
8 changed files with 12 additions and 9 deletions
|
@ -102,7 +102,6 @@ class AnimeUltimeBridge extends BridgeAbstract {
|
|||
$item_description = defaultLinkTo($item_description, self::URI);
|
||||
$item_description = str_replace("\r", '', $item_description);
|
||||
$item_description = str_replace("\n", '', $item_description);
|
||||
$item_description = utf8_encode($item_description);
|
||||
|
||||
//Build and add final item
|
||||
$item = array();
|
||||
|
|
|
@ -53,6 +53,8 @@ class DarkReadingBridge extends FeedExpander {
|
|||
|
||||
protected function parseItem($newsItem){
|
||||
$item = parent::parseItem($newsItem);
|
||||
if (empty($item['content']))
|
||||
return null; //ignore dummy articles
|
||||
$article = getSimpleHTMLDOMCached($item['uri'])
|
||||
or returnServerError('Could not request Dark Reading: ' . $item['uri']);
|
||||
$item['content'] = $this->extractArticleContent($article);
|
||||
|
|
|
@ -96,7 +96,7 @@ class FuturaSciencesBridge extends FeedExpander {
|
|||
}
|
||||
|
||||
private function extractArticleContent($article){
|
||||
$contents = $article->find('section.article-text-classic', 0)->innertext;
|
||||
$contents = $article->find('section.article-text', 1)->innertext;
|
||||
$headline = trim($article->find('p.description', 0)->plaintext);
|
||||
if(!empty($headline))
|
||||
$headline = '<p><b>' . $headline . '</b></p>';
|
||||
|
@ -129,6 +129,7 @@ class FuturaSciencesBridge extends FeedExpander {
|
|||
$contents = stripWithDelimiters($contents, 'fs:xt:clickname="', '"');
|
||||
$contents = StripWithDelimiters($contents, '<section class="module-toretain module-propal-nl', '</section>');
|
||||
$contents = stripWithDelimiters($contents, '<script ', '</script>');
|
||||
$contents = stripWithDelimiters($contents, '<script>', '</script>');
|
||||
|
||||
return $headline . trim($contents);
|
||||
}
|
||||
|
|
|
@ -113,8 +113,8 @@ class GBAtempBridge extends BridgeAbstract {
|
|||
break;
|
||||
case 'T':
|
||||
foreach($html->find('li.portal-tutorial') as $tutorialItem) {
|
||||
$url = self::URI . $tutorialItem->find('a', 0)->href;
|
||||
$title = $tutorialItem->find('a', 0)->plaintext;
|
||||
$url = self::URI . $tutorialItem->find('a', 1)->href;
|
||||
$title = $tutorialItem->find('a', 1)->plaintext;
|
||||
$time = $this->findItemDate($tutorialItem);
|
||||
$author = $tutorialItem->find('a.username', 0)->plaintext;
|
||||
$content = $this->fetchPostContent($url, self::URI);
|
||||
|
|
|
@ -26,8 +26,8 @@ class LeMondeInformatiqueBridge extends FeedExpander {
|
|||
|
||||
//No response header sets the encoding, explicit conversion is needed or subsequent xml_encode() will fail
|
||||
$content_node = $article_html->find('div.col-primary, div.col-sm-9', 0);
|
||||
$item['content'] = utf8_encode($this->cleanArticle($content_node->innertext));
|
||||
$item['author'] = utf8_encode($article_html->find('div.author-infos', 0)->find('b', 0)->plaintext);
|
||||
$item['content'] = $this->cleanArticle($content_node->innertext);
|
||||
$item['author'] = $article_html->find('div.author-infos', 0)->find('b', 0)->plaintext;
|
||||
|
||||
return $item;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ class TheHackerNewsBridge extends BridgeAbstract {
|
|||
|
||||
$article_url = $element->find('a.story-link', 0)->href;
|
||||
$article_author = trim($element->find('i.icon-user', 0)->parent()->plaintext);
|
||||
$article_author = str_replace('', '', $article_author);
|
||||
$article_title = $element->find('h2.home-title', 0)->plaintext;
|
||||
|
||||
//Date without time
|
||||
|
|
|
@ -346,7 +346,7 @@ abstract class FeedExpander extends BridgeAbstract {
|
|||
if($attribute === 'isPermaLink'
|
||||
&& ($value === 'true' || (
|
||||
filter_var($feedItem->guid, FILTER_VALIDATE_URL)
|
||||
&& !filter_var($item['uri'], FILTER_VALIDATE_URL)
|
||||
&& (empty($item['uri']) || !filter_var($item['uri'], FILTER_VALIDATE_URL))
|
||||
)
|
||||
)
|
||||
) {
|
||||
|
|
|
@ -207,7 +207,7 @@ function markdownToHtml($string) {
|
|||
|
||||
//For more details about how these regex work:
|
||||
// https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702
|
||||
// Images: https://regex101.com/r/JW9Evr/1
|
||||
// Images: https://regex101.com/r/JW9Evr/2
|
||||
// Links: https://regex101.com/r/eRGVe7/1
|
||||
// Bold: https://regex101.com/r/2p40Y0/1
|
||||
// Italic: https://regex101.com/r/xJkET9/1
|
||||
|
@ -215,7 +215,7 @@ function markdownToHtml($string) {
|
|||
// Plain URL: https://regex101.com/r/2JHYwb/1
|
||||
// Site name: https://regex101.com/r/qIuKYE/1
|
||||
|
||||
$string = preg_replace('/\!\[([^\]]+)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
|
||||
$string = preg_replace('/\!\[([^\]]*)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
|
||||
$string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>', $string);
|
||||
$string = preg_replace('/\*\*(.*)\*\*/U', '<b>$1</b>', $string);
|
||||
$string = preg_replace('/\*(.*)\*/U', '<i>$1</i>', $string);
|
||||
|
|
Loading…
Reference in a new issue