From de8cee6a1cc1c79a356d80b17d4318a86aeb9290 Mon Sep 17 00:00:00 2001 From: ORelio Date: Sun, 9 Sep 2018 21:20:13 +0200 Subject: [PATCH] Catching up | [Main] Debug mode, parse utils, MIME | [Bridges] Add/Improve 20 bridges (#802) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Debug mode improvements - Improve debug warning message - Restore error reporting in debug mode - Fix 'notice' messages for unset fields * Add parsing utility functions html.php - extractFromDelimiters - stripWithDelimiters - stripRecursiveHTMLSection - markdownToHtml (partial) bridges - remove now-duplicate functions - call functions from html.php instead * [Anidex] New bridge Anime torrent tracker * [Anime-Ultime] Restore thumbnail * [CNET] Recreate bridge Full rewrite as the previous one was broken * [Dilbert] Minor URI fix Use new self::URI property * [EstCeQuonMetEnProd] Fix content extraction Bridge was broken * [Facebook] Fix "SpSonsSoriSsés" label ... which was taking space in item title * [Futura-Sciences] Use HTTPS, More cleanup Use HTTPS as FS now offers HTTPS Clean additional useless HTML elements * [GBATemp] Multiple fixes - Fix categories: missing "break" statements - Restore thumbnail as enclosure - Fix date extraction - Fix user blog post extraction - Use getSimpleHTMLDOMCached * [JapanExpo] Fix bridge, HTTPS, thumbnails - Fix getSimpleHTMLDOMCached call - Upgrade to HTTPS as JE now offers HTTPS - Restore thumbnails as enclosures * [LeMondeInformatique] Fix bridge, HTTPS - Upgrade to HTTPS as LMI now offers HTTPS - Restore thumbnails using small images - Fix content extraction - Fix text encoding issue * [Nextgov] Fix content extraction - Restore thumbnail and use small image - Field extraction fixes * [NextInpact] Add categories and filtering by type - Offer all RSS feeds - Allow filtering by article type - Implement extraction for brief articles - Remove article limit, many brief articles are published all at once * [NyaaTorrents] 
New bridge Anime torrent tracker * [Releases3DS] Cache content, restore thumbnail - Use getSimpleHTMLDOMCached - Restore thumbnail as enclosure * [TheHackerNews] Fix bridge - Fix content extraction including article body - Restore thumbnail as enclosure * [WeLiveSecurity] HTTPS, Fix content extraction - Upgrade to HTTPS as WLS now offers HTTPS - Fix content extraction including article body * [WordPress] Reduce timeout, more content selectors - Reduce timeout to use default one (1h) - Add new content selector (articleBody) - Find thumbnail and set as enclosure - Fix '); + $article_html = stripWithDelimiters($article_html, ''); + return $article_html; + } + + public function collectData() { + + // Retrieve and check user input + $topic = str_replace('-', '/', $this->getInput('topic')); + if (!empty($topic) && (substr_count($topic, '/') > 1 || !ctype_alpha(str_replace('/', '', $topic)))) + returnClientError('Invalid topic: ' . $topic); + + // Retrieve webpage + $pageUrl = self::URI . (empty($topic) ? 'news/' : $topic.'/'); + $html = getSimpleHTMLDOM($pageUrl) + or returnServerError('Could not request CNET: '.$pageUrl); + + // Process articles + foreach($html->find('div.assetBody, div.riverPost') as $element) { + + if(count($this->items) >= 10) { + break; + } + + $article_title = trim($element->find('h2, h3', 0)->plaintext); + $article_uri = self::URI . substr($element->find('a', 0)->href, 1); + $article_thumbnail = $element->parent()->find('img[src]', 0)->src; + $article_timestamp = strtotime($element->find('time.assetTime, div.timeAgo', 0)->plaintext); + $article_author = trim($element->find('a[rel=author], a.name', 0)->plaintext); + $article_content = '

' . trim($element->find('p.dek', 0)->plaintext) . '

'; + + if (is_null($article_thumbnail)) + $article_thumbnail = extractFromDelimiters($element->innertext, 'find('div.originalImage', 0); + if (empty($article_thumbnail)) + $article_thumbnail = $article_html->find('span.imageContainer', 0); + if (is_object($article_thumbnail)) + $article_thumbnail = $article_thumbnail->find('img', 0)->src; + + $article_content .= trim( + $this->cleanArticle( + extractFromDelimiters( + $article_html, 'items[] = $item; + } + } + } +} diff --git a/bridges/DilbertBridge.php b/bridges/DilbertBridge.php index 959a91a2..a84e5e87 100644 --- a/bridges/DilbertBridge.php +++ b/bridges/DilbertBridge.php @@ -9,8 +9,8 @@ class DilbertBridge extends BridgeAbstract { public function collectData(){ - $html = getSimpleHTMLDOM($this->getURI()) - or returnServerError('Could not request Dilbert: ' . $this->getURI()); + $html = getSimpleHTMLDOM(self::URI) + or returnServerError('Could not request Dilbert: ' . self::URI); foreach($html->find('section.comic-item') as $element) { diff --git a/bridges/EstCeQuonMetEnProdBridge.php b/bridges/EstCeQuonMetEnProdBridge.php index db9d1d5f..4439d694 100644 --- a/bridges/EstCeQuonMetEnProdBridge.php +++ b/bridges/EstCeQuonMetEnProdBridge.php @@ -7,19 +7,9 @@ class EstCeQuonMetEnProdBridge extends BridgeAbstract { const CACHE_TIMEOUT = 21600; // 6h const DESCRIPTION = 'Should we put a website in production today? (French)'; - public function collectData(){ - function extractFromDelimiters($string, $start, $end){ - if(strpos($string, $start) !== false) { - $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); - $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); - return $section_retrieved; - } - - return false; - } - - $html = getSimpleHTMLDOM($this->getURI()) - or returnServerError('Could not request EstCeQuonMetEnProd: ' . 
$this->getURI()); + public function collectData() { + $html = getSimpleHTMLDOM(self::URI) + or returnServerError('Could not request EstCeQuonMetEnProd: ' . self::URI); $item = array(); $item['uri'] = $this->getURI() . '#' . date('Y-m-d'); @@ -28,8 +18,8 @@ class EstCeQuonMetEnProdBridge extends BridgeAbstract { $item['timestamp'] = strtotime('today midnight'); $item['content'] = str_replace( 'src="/', - 'src="' . $this->getURI(), - trim(extractFromDelimiters($html->outertext, '', '

')) + 'src="' . self::URI, + trim(extractFromDelimiters($html->outertext, '', '