From 3d87ecbf8c75cc9bae2eccced1f6dcff75e9a818 Mon Sep 17 00:00:00 2001 From: Dreckiger-Dan Date: Wed, 15 May 2019 21:40:50 +0200 Subject: [PATCH 01/42] [.gitignore] Add robots.txt to the ignore list (#1128) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d970fed9..680260c7 100644 --- a/.gitignore +++ b/.gitignore @@ -240,3 +240,6 @@ config.ini.php #Auth .htaccess .htpasswd + +#Crawler +robots.txt From 16bd2aec7a69b98b4b55002a3ba6f54805906359 Mon Sep 17 00:00:00 2001 From: killruana Date: Wed, 15 May 2019 21:51:23 +0200 Subject: [PATCH 02/42] [MediapartBridge] Add new bridge (#1130) * If no cookie session is defined, use the default rss stream * Add a parameter for enabling/disabling the single page mode --- bridges/MediapartBridge.php | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 bridges/MediapartBridge.php diff --git a/bridges/MediapartBridge.php b/bridges/MediapartBridge.php new file mode 100644 index 00000000..15d1d3ea --- /dev/null +++ b/bridges/MediapartBridge.php @@ -0,0 +1,60 @@ + array( + 'name' => 'Single page article', + 'type' => 'checkbox', + 'title' => 'Display long articles on a single page', + 'defaultValue' => 'checked' + ), + 'mpsessid' => array( + 'name' => 'MPSESSID', + 'type' => 'text', + 'title' => 'Value of the session cookie MPSESSID' + ) + ) + ); + const CACHE_TIMEOUT = 7200; // 2h + const DESCRIPTION = 'Returns the newest articles.'; + + public function collectData() { + $url = self::URI . 'articles/feed'; + $this->collectExpandableDatas($url); + } + + protected function parseItem($newsItem) { + $item = parent::parseItem($newsItem); + + // Enable single page mode? 
+ if ($this->getInput('single_page_mode') === true) { + $item['uri'] .= '?onglet=full'; + } + + // If a session cookie is defined, get the full article + $mpsessid = $this->getInput('mpsessid'); + if (!empty($mpsessid)) { + // Set the session cookie + $opt = array(); + $opt[CURLOPT_COOKIE] = 'MPSESSID=' . $mpsessid; + + // Get the page + $articlePage = getSimpleHTMLDOM( + $newsItem->link . '?onglet=full', + array(), + $opt); + + // Extract the article content + $content = $articlePage->find('div.content-article', 0)->innertext; + $content = sanitize($content); + $content = defaultLinkTo($content, static::URI); + $item['content'] .= $content; + } + + return $item; + } +} From 7cf898b5afac65e1c32c427481a1731fee32c2bb Mon Sep 17 00:00:00 2001 From: Tobias Alexander Franke Date: Wed, 29 May 2019 20:50:04 +0000 Subject: [PATCH 03/42] [SteamCommunityBridge] Add new bridge (#1136) * [SteamCommunityBridge] Add new bridge --- bridges/SteamCommunityBridge.php | 127 +++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 bridges/SteamCommunityBridge.php diff --git a/bridges/SteamCommunityBridge.php b/bridges/SteamCommunityBridge.php new file mode 100644 index 00000000..56ea257c --- /dev/null +++ b/bridges/SteamCommunityBridge.php @@ -0,0 +1,127 @@ + array( + 'name' => 'App ID', + 'required' => true + ), + 'category' => array( + 'name' => 'category', + 'type' => 'list', + 'exampleValue' => 'Artwork', + 'title' => 'Select a category', + 'values' => array( + 'Artwork' => 'images', + 'Screenshots' => 'screenshots', + 'Videos' => 'videos' + ) + ) + ) + ); + + public function getIcon() { + return self::URI . '/favicon.ico'; + } + + protected function getMainPage() { + $category = $this->getInput('category'); + $html = getSimpleHTMLDOM($this->getURI() . 
'/?p=1&browsefilter=mostrecent') + or returnServerError('Could not fetch Steam data.'); + + return $html; + } + + public function getName() { + $category = $this->getInput('category'); + + if (is_null('i') || is_null($category)) { + return self::NAME; + } + + $html = $this->getMainPage(); + + $titleItem = $html->find('div.apphub_AppName', 0); + + if (!$titleItem) + return self::NAME; + + return $titleItem->innertext . ' (' . ucwords($category) . ')'; + } + + public function getURI() { + return self::URI . '/app/' + . $this->getInput('i') . '/' + . $this->getInput('category'); + } + + public function collectData() { + $category = $this->getInput('category'); + $html = $this->getMainPage(); + $cards = $html->find('div.apphub_Card'); + + foreach($cards as $card) { + $uri = $card->getAttribute('data-modal-content-url'); + + $htmlCard = getSimpleHTMLDOMCached($uri); + + $author = $card->find('div.apphub_CardContentAuthorName', 0)->innertext; + $author = strip_tags($author); + + $title = $author . '\'s screenshot'; + + if ($category != 'screenshots') + $title = $htmlCard->find('div.workshopItemTitle', 0)->innertext; + + $date = $htmlCard->find('div.detailsStatRight', 0)->innertext; + + // create item + $item = array(); + $item['title'] = $title; + $item['uri'] = $uri; + $item['timestamp'] = strtotime($date); + $item['author'] = $author; + $item['categories'] = $category; + + $media = $htmlCard->getElementById('ActualMedia'); + $mediaURI = $media->getAttribute('src'); + $downloadURI = $mediaURI; + + if ($category == 'videos') { + preg_match('/.*\/embed\/(.*)\?/', $mediaURI, $result); + $youtubeID = $result[1]; + $mediaURI = 'https://img.youtube.com/vi/' . $youtubeID . '/hqdefault.jpg'; + $downloadURI = 'https://www.youtube.com/watch?v=' . 
$youtubeID; + } + + $desc = ''; + + if ($category == 'screenshots') { + $descItem = $htmlCard->find('div.screenshotDescription', 0); + if ($descItem) + $desc = $descItem->innertext; + } + + if ($category == 'images') { + $descItem = $htmlCard->find('div.nonScreenshotDescription', 0); + if ($descItem) + $desc = $descItem->innertext; + $downloadURI = $htmlCard->find('a.downloadImage', 0)->href; + } + + $item['content'] = '

'; + $item['content'] .= '

' . $desc . '

'; + + $this->items[] = $item; + + if (count($this->items) >= 10) + break; + } + } +} From 84d48d5614a3c697e51bd4f65901f508a997c823 Mon Sep 17 00:00:00 2001 From: somini Date: Wed, 29 May 2019 21:51:52 +0100 Subject: [PATCH 04/42] [QPlayBridge]: New Bridge (#1118) * [QPlayBridge]: New Bridge --- bridges/QPlayBridge.php | 132 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 bridges/QPlayBridge.php diff --git a/bridges/QPlayBridge.php b/bridges/QPlayBridge.php new file mode 100644 index 00000000..f2043267 --- /dev/null +++ b/bridges/QPlayBridge.php @@ -0,0 +1,132 @@ + array( + 'program' => array( + 'name' => 'Program Name', + 'type' => 'text', + 'required' => true, + ), + ), + 'Catalog' => array( + 'all_pages' => array( + 'name' => 'All Pages', + 'type' => 'checkbox', + 'defaultValue' => false, + ), + ), + ); + + public function getIcon() { + # This should be the favicon served on `self::URI` + return 'https://s3.amazonaws.com/unode1/assets/4957/r3T9Lm9LTLmpAEX6FlSA_apple-touch-icon.png'; + } + + public function getURI() { + switch ($this->queriedContext) { + case 'Program': + return self::URI . '/programs/' . $this->getInput('program'); + case 'Catalog': + return self::URI . '/catalog'; + } + return parent::getURI(); + } + + public function getName() { + switch ($this->queriedContext) { + case 'Program': + $html = getSimpleHTMLDOMCached($this->getURI()) + or returnServerError('Could not load content'); + + return $html->find('h1.program--title', 0)->innertext; + case 'Catalog': + return self::NAME . ' | Programas'; + } + + return parent::getName(); + } + + /* This uses the uscreen platform, other sites can adapt this. 
https://www.uscreen.tv/ */ + public function collectData() { + switch ($this->queriedContext) { + case 'Program': + $program = $this->getInput('program'); + $html = getSimpleHTMLDOMCached($this->getURI()) + or returnServerError('Could not load content'); + + foreach($html->find('.cce--thumbnails-video-chapter') as $element) { + $cid = $element->getAttribute('data-id'); + $item['title'] = $element->find('.cce--chapter-title', 0)->innertext; + $item['content'] = $element->find('.cce--thumbnails-image-block', 0) + . $element->find('.cce--chapter-body', 0)->innertext; + $item['uri'] = $this->getURI() . '?cid=' . $cid; + + /* TODO: Suport login credentials? */ + /* # Get direct video URL */ + /* $json_source = getContents(self::URI . '/chapters/' . $cid, array('Cookie: _uscreen2_session=???;')) */ + /* or returnServerError('Could not request chapter JSON'); */ + /* $json = json_decode($json_source); */ + + /* $item['enclosures'] = [$json->fallback]; */ + + $this->items[] = $item; + } + + break; + case 'Catalog': + $json_raw = getContents($this->getCatalogURI(1)) + or returnServerError('Could not load catalog content'); + + $json = json_decode($json_raw); + $total_pages = $json->total_pages; + + foreach($this->parseCatalogPage($json) as $item) { + $this->items[] = $item; + } + + if ($this->getInput('all_pages') === true) { + foreach(range(2, $total_pages) as $page) { + $json_raw = getContents($this->getCatalogURI($page)) + or returnServerError('Could not load catalog content (all pages)'); + + $json = json_decode($json_raw); + + foreach($this->parseCatalogPage($json) as $item) { + $this->items[] = $item; + } + } + } + + break; + } + } + + private function getCatalogURI($page) { + return self::URI . '/catalog.json?page=' . $page; + } + + private function parseCatalogPage($json) { + $items = array(); + + foreach($json->records as $record) { + $item = array(); + + $item['title'] = $record->title; + $item['content'] = $record->description + . '
Duration: ' . $record->duration . '
'; + $item['timestamp'] = strtotime($record->release_date); + $item['uri'] = self::URI . $record->url; + $item['enclosures'] = array( + $record->main_poster, + ); + + $items[] = $item; + } + + return $items; + } +} From 4a60f05fd6517c3d90a6f66567428170d792cca0 Mon Sep 17 00:00:00 2001 From: Tobias Alexander Franke Date: Sat, 1 Jun 2019 09:18:30 +0000 Subject: [PATCH 05/42] [BinanceBridge] Add new bridge (#1135) --- bridges/BinanceBridge.php | 103 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 bridges/BinanceBridge.php diff --git a/bridges/BinanceBridge.php b/bridges/BinanceBridge.php new file mode 100644 index 00000000..9653ab73 --- /dev/null +++ b/bridges/BinanceBridge.php @@ -0,0 +1,103 @@ + array( + 'name' => 'category', + 'type' => 'list', + 'exampleValue' => 'Blog', + 'title' => 'Select a category', + 'values' => array( + 'Blog' => 'Blog', + 'Announcements' => 'Announcements' + ) + ) + )); + + public function getIcon() { + return 'https://bin.bnbstatic.com/static/images/common/favicon.ico'; + } + + public function getName() { + return self::NAME . ' ' . $this->getInput('category'); + } + + public function getURI() { + if ($this->getInput('category') == 'Blog') + return self::URI . '/en/blog'; + else + return 'https://binance.zendesk.com/hc/en-us/categories/115000056351-Announcements'; + } + + protected function collectBlogData() { + $html = getSimpleHTMLDOM($this->getURI()) + or returnServerError('Could not fetch Binance blog data.'); + + foreach($html->find('div[direction="row"]') as $element) { + + $date = $element->find('div[direction="column"]', 0); + $day = $date->find('div', 0)->innertext; + $month = $date->find('div', 1)->innertext; + $extractedDate = $day . ' ' . $month; + + $abstract = $element->find('div[direction="column"]', 1); + $a = $abstract->find('a', 0); + $uri = self::URI . 
$a->href; + $title = $a->innertext; + + $full = getSimpleHTMLDOMCached($uri); + $content = $full->find('div.desc', 1); + + $item = array(); + $item['title'] = $title; + $item['uri'] = $uri; + $item['timestamp'] = strtotime($extractedDate); + $item['author'] = 'Binance'; + $item['content'] = $content; + + $this->items[] = $item; + + if (count($this->items) >= 10) + break; + } + } + + protected function collectAnnouncementData() { + $html = getSimpleHTMLDOM($this->getURI()) + or returnServerError('Could not fetch Zendesk announcement data.'); + + foreach($html->find('a.article-list-link') as $a) { + $title = $a->innertext; + $uri = 'https://binance.zendesk.com' . $a->href; + + $full = getSimpleHTMLDOMCached($uri); + $content = $full->find('div.article-body', 0); + $date = $full->find('time', 0)->getAttribute('datetime'); + + $item = array(); + + $item['title'] = $title; + $item['uri'] = $uri; + $item['timestamp'] = strtotime($date); + $item['author'] = 'Binance'; + $item['content'] = $content; + + $this->items[] = $item; + + if (count($this->items) >= 10) + break; + } + } + + public function collectData() { + if ($this->getInput('category') == 'Blog') + $this->collectBlogData(); + else + $this->collectAnnouncementData(); + } +} From 72bcc173eb7d222375f1766ea98c682ca0fd242e Mon Sep 17 00:00:00 2001 From: Nemo Date: Sat, 1 Jun 2019 14:55:01 +0530 Subject: [PATCH 06/42] [Docker] Switch Docker Image to official php base image (#1140) * Switch Docker Image to official php base image Switch from the unofficial Alpine+php image to the official php-apache image. This has 2 advantages: 1. Official image is guaranteed to have regular updates, etc 2. 
The persistent Docker Alpine DNS Issue goes away; https://github.com/gliderlabs/docker-alpine/issues/255 * [Docker] Ignore more files from Docker Image --- .dockerignore | 11 +++++++++-- Dockerfile | 12 +++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/.dockerignore b/.dockerignore index f2bc0e8d..db313697 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,7 +1,14 @@ .git +.gitattributes +.github/* +.travis.yml cache/* +CONTRIBUTING.md DEBUG Dockerfile -whitelist.txt +phpcompatibility.xml phpcs.xml -CONTRIBUTING.md \ No newline at end of file +phpcs.xml +scalingo.json +tests/* +whitelist.txt \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 35caac84..7d0611be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,11 @@ -FROM ulsmith/alpine-apache-php7 +FROM php:7-apache -COPY ./ /app/public/ +ENV APACHE_DOCUMENT_ROOT=/app -RUN chown -R apache:root /app/public \ No newline at end of file +RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini" \ + && apt-get --yes update && apt-get --yes install libxml2-dev \ + && docker-php-ext-install -j$(nproc) simplexml \ + && sed -ri -e 's!/var/www/html!${APACHE_DOCUMENT_ROOT}!g' /etc/apache2/sites-available/*.conf \ + && sed -ri -e 's!/var/www/!${APACHE_DOCUMENT_ROOT}!g' /etc/apache2/apache2.conf /etc/apache2/conf-available/*.conf + +COPY --chown=www-data:www-data ./ /app/ \ No newline at end of file From 2a254855d8ea08a54e60bfcf0259e11ef061fa6e Mon Sep 17 00:00:00 2001 From: Joseph Date: Sat, 1 Jun 2019 10:06:58 +0000 Subject: [PATCH 07/42] [HaveIBeenPwnedBridge] Add new bridge (#1144) --- bridges/HaveIBeenPwnedBridge.php | 102 +++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 bridges/HaveIBeenPwnedBridge.php diff --git a/bridges/HaveIBeenPwnedBridge.php b/bridges/HaveIBeenPwnedBridge.php new file mode 100644 index 00000000..f256623a --- /dev/null +++ b/bridges/HaveIBeenPwnedBridge.php @@ -0,0 +1,102 @@ + array( + 'name' => 'Order by', + 
'type' => 'list', + 'values' => array( + 'Breach date' => 'breachDate', + 'Date added to HIBP' => 'dateAdded', + ), + 'defaultValue' => 'dateAdded', + ) + )); + + const CACHE_TIMEOUT = 3600; + + private $breachDateRegex = '/Breach date: ([0-9]{1,2} [A-Z-a-z]+ [0-9]{4})/'; + private $dateAddedRegex = '/Date added to HIBP: ([0-9]{1,2} [A-Z-a-z]+ [0-9]{4})/'; + private $accountsRegex = '/Compromised accounts: ([0-9,]+)/'; + + private $breaches = array(); + + public function collectData() { + + $html = getSimpleHTMLDOM(self::URI . '/PwnedWebsites') + or returnServerError('Could not request: ' . self::URI . '/PwnedWebsites'); + + $breaches = array(); + + foreach($html->find('div.row') as $breach) { + $item = array(); + + if ($breach->class != 'row') { + continue; + } + + preg_match($this->breachDateRegex, $breach->find('p', 1)->plaintext, $breachDate) + or returnServerError('Could not extract details'); + + preg_match($this->dateAddedRegex, $breach->find('p', 1)->plaintext, $dateAdded) + or returnServerError('Could not extract details'); + + preg_match($this->accountsRegex, $breach->find('p', 1)->plaintext, $accounts) + or returnServerError('Could not extract details'); + + $permalink = $breach->find('p', 1)->find('a', 0)->href; + + // Remove permalink + $breach->find('p', 1)->find('a', 0)->outertext = ''; + + $item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . ' breached accounts'; + $item['dateAdded'] = strtotime($dateAdded[1]); + $item['breachDate'] = strtotime($breachDate[1]); + $item['uri'] = self::URI . '/PwnedWebsites' . $permalink; + + $item['content'] = '

' . $breach->find('p', 0)->innertext . '

'; + $item['content'] .= '

' . $breach->find('p', 1)->innertext . '

'; + + $this->breaches[] = $item; + } + + $this->orderBreaches(); + $this->createItems(); + } + + /** + * Order Breaches by date added or date breached + */ + private function orderBreaches() { + + $sortBy = $this->getInput('order'); + $sort = array(); + + foreach ($this->breaches as $key => $item) { + $sort[$key] = $item[$sortBy]; + } + + array_multisort($sort, SORT_DESC, $this->breaches); + + } + + /** + * Create items from breaches array + */ + private function createItems() { + + foreach ($this->breaches as $breach) { + $item = array(); + + $item['title'] = $breach['title']; + $item['timestamp'] = $breach[$this->getInput('order')]; + $item['uri'] = $breach['uri']; + $item['content'] = $breach['content']; + + $this->items[] = $item; + } + } +} From b519d350bf81faf81d53a63035a37ed9a32e9fb6 Mon Sep 17 00:00:00 2001 From: sysadminstory Date: Sat, 1 Jun 2019 12:12:17 +0200 Subject: [PATCH 08/42] [RadioMelodieBridge] Fix bridge after website update (#1145) - The bridge has been adapted to the new website layout - The content now shows the header picture below the date --- bridges/RadioMelodieBridge.php | 47 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/bridges/RadioMelodieBridge.php b/bridges/RadioMelodieBridge.php index 03a14a43..fb5aca6e 100644 --- a/bridges/RadioMelodieBridge.php +++ b/bridges/RadioMelodieBridge.php @@ -12,11 +12,12 @@ class RadioMelodieBridge extends BridgeAbstract { public function collectData(){ $html = getSimpleHTMLDOM(self::URI . '/actu/') or returnServerError('Could not request Radio Melodie.'); - $list = $html->find('div[class=actu_col1]', 0)->children();; + $list = $html->find('div[class=displayList]', 0)->children(); foreach($list as $element) { if($element->tag == 'a') { $articleURL = self::URI . 
$element->href; $article = getSimpleHTMLDOM($articleURL); + $textDOM = $article->find('article', 0); // Initialise arrays $item = array(); @@ -24,52 +25,50 @@ class RadioMelodieBridge extends BridgeAbstract { $picture = array(); // Get the Main picture URL - $picture[] = $this->rewriteImage($article->find('img[id=picturearticle]', 0)->src); - $audioHTML = $article->find('div[class=sm2-playlist-wrapper]'); + $picture[] = $this->rewriteImage($article->find('div[id=pictureTitleSupport]', 0)->find('img', 0)->src); + $audioHTML = $article->find('audio'); - // Remove the audio placeholder under the Audio player with an

' . $item['title'] . '

' . $date_category . $header . $text; + $item['content'] = '

' . $item['title'] . '

' . $date . '
' . $header . $text; $this->items[] = $item; } } @@ -81,7 +80,7 @@ class RadioMelodieBridge extends BridgeAbstract { private function rewriteImage($url) { $parts = explode('?', $url); - parse_str($parts[1], $params); + parse_str(html_entity_decode($parts[1]), $params); return self::URI . '/' . $params['image']; } From b889e867fd390f86ebf658cc2b31ca45baf37e6b Mon Sep 17 00:00:00 2001 From: Joseph Date: Sat, 1 Jun 2019 13:04:43 +0000 Subject: [PATCH 09/42] [SoundCloudBridge] Use account avatar as feed icon (#1146) --- bridges/SoundcloudBridge.php | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bridges/SoundcloudBridge.php b/bridges/SoundcloudBridge.php index 30958566..8938ff96 100644 --- a/bridges/SoundcloudBridge.php +++ b/bridges/SoundcloudBridge.php @@ -16,6 +16,8 @@ class SoundCloudBridge extends BridgeAbstract { const CLIENT_ID = 'W0KEWWILAjDiRH89X0jpwzuq6rbSK08R'; + private $feedIcon = null; + public function collectData(){ $res = json_decode(getContents( @@ -25,6 +27,8 @@ class SoundCloudBridge extends BridgeAbstract { . self::CLIENT_ID )) or returnServerError('No results for this query'); + $this->feedIcon = $res->avatar_url; + $tracks = json_decode(getContents( 'https://api.soundcloud.com/users/' . urlencode($res->id) @@ -56,6 +60,14 @@ class SoundCloudBridge extends BridgeAbstract { } + public function getIcon(){ + if ($this->feedIcon) { + return $this->feedIcon; + } + + return parent::getIcon(); + } + public function getName(){ if(!is_null($this->getInput('u'))) { return self::NAME . ' - ' . $this->getInput('u'); From 66c5b732cf21dff54908a2e3091ec9d8b0a2cc80 Mon Sep 17 00:00:00 2001 From: fulmeek <36341513+fulmeek@users.noreply.github.com> Date: Sat, 1 Jun 2019 19:36:46 +0200 Subject: [PATCH 10/42] [FeedItem] Avoid repeated UID hashing after loading from cache (#1148) This fixes the following issue: 1. bridge sets unique ids for the items (ids get hashed) 2. items go to the cache 3. on next run items get loaded from cache 4. 
these items have different ids because they were hashed again 5. they show up twice in feed reader --- lib/FeedItem.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/FeedItem.php b/lib/FeedItem.php index b0095be2..9a435730 100644 --- a/lib/FeedItem.php +++ b/lib/FeedItem.php @@ -418,6 +418,9 @@ class FeedItem { if(!is_string($uid)) { Debug::log('Unique id must be a string!'); + } elseif (preg_match('/^[a-f0-9]{40}$/', $uid)) { + // keep id if it already is a SHA-1 hash + $this->uid = $uid; } else { $this->uid = sha1($uid); } From 5656792cee3ddf4b38a60b6d05b5b08fa705a4c3 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 1 Jun 2019 19:45:28 +0200 Subject: [PATCH 11/42] [simplehtmldom] Update to version 1.9 Find the release notes at https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.9/ --- vendor/simplehtmldom/LICENSE | 21 + vendor/simplehtmldom/simple_html_dom.php | 1020 ++++++---------------- 2 files changed, 300 insertions(+), 741 deletions(-) create mode 100644 vendor/simplehtmldom/LICENSE diff --git a/vendor/simplehtmldom/LICENSE b/vendor/simplehtmldom/LICENSE new file mode 100644 index 00000000..6040f77b --- /dev/null +++ b/vendor/simplehtmldom/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 S.C. Chen, John Schlick, logmanoriginal + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/vendor/simplehtmldom/simple_html_dom.php b/vendor/simplehtmldom/simple_html_dom.php index c0001e3b..d30b018e 100644 --- a/vendor/simplehtmldom/simple_html_dom.php +++ b/vendor/simplehtmldom/simple_html_dom.php @@ -3,64 +3,24 @@ * Website: http://sourceforge.net/projects/simplehtmldom/ * Additional projects: http://sourceforge.net/projects/debugobject/ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) - * Contributions by: - * Yousuke Kumakura (Attribute filters) - * Vadim Voituk (Negative indexes supports of "find" method) - * Antcs (Constructor with automatically load contents either text or file/url) - * - * all affected sections have comments starting with "PaperG" - * - * Paperg - Added case insensitive testing of the value of the selector. - * - * Paperg - Added tag_start for the starting index of tags - NOTE: This works - * but not accurately. This tag_start gets counted AFTER \r\n have been crushed - * out, and after the remove_noice calls so it will not reflect the REAL - * position of the tag in the source, it will almost always be smaller by some - * amount. We use this to determine how far into the file the tag in question - * is. This "percentage" will never be accurate as the $dom->size is the "real" - * number of bytes the dom was created from. But for most purposes, it's a - * really good estimation. - * - * Paperg - Added the forceTagsClosed to the dom constructor. 
Forcing tags - * closed is great for malformed html, but it CAN lead to parsing errors. - * - * Allow the user to tell us how much they trust the html. - * - * Paperg add the text and plaintext to the selectors for the find syntax. - * plaintext implies text in the innertext of a node. text implies that the - * tag is a text node. This allows for us to find tags based on the text they - * contain. - * - * Create find_ancestor_tag to see if a tag is - at any level - inside of - * another specific tag. - * - * Paperg: added parse_charset so that we know about the character set of - * the source document. NOTE: If the user's system has a routine called - * get_last_retrieve_url_contents_content_type availalbe, we will assume it's - * returning the content-type header from the last transfer or curl_exec, and - * we will parse that and use it in preference to any other method of charset - * detection. - * - * Found infinite loop in the case of broken html in restore_noise. Rewrote to - * protect from that. - * - * PaperG (John Schlick) Added get_display_size for "IMG" tags. * * Licensed under The MIT License - * Redistributions of files must retain the above copyright notice. + * See the LICENSE file in the project root for more information. * - * @author S.C. Chen - * @author John Schlick - * @author Rus Carroll - * @version Rev. 1.8.1 (247) - * @package PlaceLocalInclude - * @subpackage simple_html_dom + * Authors: + * S.C. Chen + * John Schlick + * Rus Carroll + * logmanoriginal + * + * Contributors: + * Yousuke Kumakura + * Vadim Voituk + * Antcs + * + * Version Rev. 1.9 (290) */ -/** - * All of the Defines for the classes below. - * @author S.C. 
Chen - */ define('HDOM_TYPE_ELEMENT', 1); define('HDOM_TYPE_COMMENT', 2); define('HDOM_TYPE_TEXT', 3); @@ -79,25 +39,12 @@ define('HDOM_INFO_INNER', 5); define('HDOM_INFO_OUTER', 6); define('HDOM_INFO_ENDSPACE', 7); -/** The default target charset */ defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); - -/** The default
text used instead of
tags when returning text */ defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); - -/** The default text used instead of tags when returning text */ defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); - -/** The maximum file size the parser should load */ defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); - -/** Contents between curly braces "{" and "}" are interpreted as text */ define('HDOM_SMARTY_AS_TEXT', 1); -// helper functions -// ----------------------------------------------------------------------------- -// get html dom from file -// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. function file_get_html( $url, $use_include_path = false, @@ -111,10 +58,8 @@ function file_get_html( $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) { - // Ensure maximum length is greater than zero if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } - // We DO force the tags to be terminated. $dom = new simple_html_dom( null, $lowercase, @@ -122,7 +67,8 @@ function file_get_html( $target_charset, $stripRN, $defaultBRText, - $defaultSpanText); + $defaultSpanText + ); /** * For sourceforge users: uncomment the next line and comment the @@ -133,19 +79,18 @@ function file_get_html( $use_include_path, $context, $offset, - $maxLen); - - // Paperg - use our own mechanism for getting the contents as we want to - // control the timeout. + $maxLen + ); // $contents = retrieve_url_contents($url); - if (empty($contents) || strlen($contents) > $maxLen) { return false; } - // The second parameter can force the selectors to all be lowercase. 
- $dom->load($contents, $lowercase, $stripRN); - return $dom; + if (empty($contents) || strlen($contents) > $maxLen) { + $dom->clear(); + return false; + } + + return $dom->load($contents, $lowercase, $stripRN); } -// get html dom from string function str_get_html( $str, $lowercase = true, @@ -162,97 +107,34 @@ function str_get_html( $target_charset, $stripRN, $defaultBRText, - $defaultSpanText); + $defaultSpanText + ); if (empty($str) || strlen($str) > MAX_FILE_SIZE) { $dom->clear(); return false; } - $dom->load($str, $lowercase, $stripRN); - return $dom; + return $dom->load($str, $lowercase, $stripRN); } -// dump html dom tree function dump_html_tree($node, $show_attr = true, $deep = 0) { $node->dump($node); } -/** - * simple html dom node - * PaperG - added ability for "find" routine to lowercase the value of the - * selector. - * - * PaperG - added $tag_start to track the start position of the tag in the total - * byte index - * - * @package PlaceLocalInclude - */ class simple_html_dom_node { - /** - * Node type - * - * Default is {@see HDOM_TYPE_TEXT} - * - * @var int - */ public $nodetype = HDOM_TYPE_TEXT; - - /** - * Tag name - * - * Default is 'text' - * - * @var string - */ public $tag = 'text'; - - /** - * List of attributes - * - * @var array - */ public $attr = array(); - - /** - * List of child node objects - * - * @var array - */ public $children = array(); public $nodes = array(); - - /** - * The parent node object - * - * @var object|null - */ public $parent = null; - - // The "info" array - see HDOM_INFO_... for what each element contains. 
public $_ = array(); - - /** - * Start position of the tag in the document - * - * @var int - */ public $tag_start = 0; - - /** - * The DOM object - * - * @var object|null - */ private $dom = null; - /** - * Construct new node object - * - * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes} - */ function __construct($dom) { $this->dom = $dom; @@ -269,7 +151,6 @@ class simple_html_dom_node return $this->outertext(); } - // clean up memory due to php5 circular references memory leak... function clear() { $this->dom = null; @@ -278,17 +159,14 @@ class simple_html_dom_node $this->children = null; } - // dump node's tree - function dump($show_attr = true, $deep = 0) + function dump($show_attr = true, $depth = 0) { - $lead = str_repeat(' ', $deep); - - echo $lead . $this->tag; + echo str_repeat("\t", $depth) . $this->tag; if ($show_attr && count($this->attr) > 0) { echo '('; foreach ($this->attr as $k => $v) { - echo "[$k]=>\"" . $this->$k . '", '; + echo "[$k]=>\"$v\", "; } echo ')'; } @@ -296,14 +174,12 @@ class simple_html_dom_node echo "\n"; if ($this->nodes) { - foreach ($this->nodes as $c) { - $c->dump($show_attr, $deep + 1); + foreach ($this->nodes as $node) { + $node->dump($show_attr, $depth + 1); } } } - - // Debugging function to dump a single dom node with a bunch of information about it. function dump_node($echo = true) { $string = $this->tag; @@ -311,7 +187,7 @@ class simple_html_dom_node if (count($this->attr) > 0) { $string .= '('; foreach ($this->attr as $k => $v) { - $string .= "[$k]=>\"" . $this->$k . '", '; + $string .= "[$k]=>\"$v\", "; } $string .= ')'; } @@ -322,24 +198,24 @@ class simple_html_dom_node if (is_array($v)) { $string .= "[$k]=>("; foreach ($v as $k2 => $v2) { - $string .= "[$k2]=>\"" . $v2 . '", '; + $string .= "[$k2]=>\"$v2\", "; } $string .= ')'; } else { - $string .= "[$k]=>\"" . $v . '", '; + $string .= "[$k]=>\"$v\", "; } } $string .= ')'; } if (isset($this->text)) { - $string .= ' text: (' . $this->text . 
')'; + $string .= " text: ({$this->text})"; } - $string .= " HDOM_INNER_INFO: '"; + $string .= ' HDOM_INNER_INFO: '; if (isset($node->_[HDOM_INFO_INNER])) { - $string .= $node->_[HDOM_INFO_INNER] . "'"; + $string .= "'" . $node->_[HDOM_INFO_INNER] . "'"; } else { $string .= ' NULL '; } @@ -357,13 +233,6 @@ class simple_html_dom_node } } - /** - * Return or set parent node - * - * @param object|null $parent (optional) The parent node, `null` to return - * the current parent node. - * @return object|null The parent node - */ function parent($parent = null) { // I am SURE that this doesn't work properly. @@ -378,22 +247,11 @@ class simple_html_dom_node return $this->parent; } - /** - * @return bool True if the node has at least one child node - */ function has_child() { return !empty($this->children); } - /** - * Get child node at specified index - * - * @param int $idx The index of the child node to return, `-1` to return all - * child nodes. - * @return object|array|null The child node at the specified index, all child - * nodes or null if the index is invalid. - */ function children($idx = -1) { if ($idx === -1) { @@ -407,15 +265,6 @@ class simple_html_dom_node return null; } - /** - * Get first child node - * - * @return object|null The first child node or null if the current node has - * no child nodes. - * - * @todo Use `empty()` instead of `count()` to improve performance on large - * arrays. - */ function first_child() { if (count($this->children) > 0) { @@ -424,108 +273,70 @@ class simple_html_dom_node return null; } - /** - * Get last child node - * - * @return object|null The last child node or null if the current node has - * no child nodes. - * - * @todo Use `end()` to slightly improve performance on large arrays. 
- */ function last_child() { - if (($count = count($this->children)) > 0) { - return $this->children[$count - 1]; + if (count($this->children) > 0) { + return end($this->children); } return null; } - /** - * Get next sibling node - * - * @return object|null The sibling node or null if the current node has no - * sibling nodes. - */ function next_sibling() { if ($this->parent === null) { return null; } - $idx = 0; - $count = count($this->parent->children); + $idx = array_search($this, $this->parent->children, true); - while ($idx < $count && $this !== $this->parent->children[$idx]) { - ++$idx; + if ($idx !== false && isset($this->parent->children[$idx + 1])) { + return $this->parent->children[$idx + 1]; } - if (++$idx >= $count) { + return null; + } + + function prev_sibling() + { + if ($this->parent === null) { return null; } - return $this->parent->children[$idx]; - } + $idx = array_search($this, $this->parent->children, true); - /** - * Get previous sibling node - * - * @return object|null The sibling node or null if the current node has no - * sibling nodes. - */ - function prev_sibling() - { - if ($this->parent === null) { return null; } - - $idx = 0; - $count = count($this->parent->children); - - while ($idx < $count && $this !== $this->parent->children[$idx]) { - ++$idx; + if ($idx !== false && $idx > 0) { + return $this->parent->children[$idx - 1]; } - if (--$idx < 0) { return null; } - - return $this->parent->children[$idx]; + return null; } - /** - * Traverse ancestors to the first matching tag. - * - * @param string $tag Tag to find - * @return object|null First matching node in the DOM tree or null if no - * match was found. - * - * @todo Null is returned implicitly by calling ->parent on the root node. - * This behaviour could change at any time, rendering this function invalid. 
- */ function find_ancestor_tag($tag) { global $debug_object; if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } - // Start by including ourselves in the comparison. - $returnDom = $this; + if ($this->parent === null) { + return null; + } - while (!is_null($returnDom)) { + $ancestor = $this->parent; + + while (!is_null($ancestor)) { if (is_object($debug_object)) { - $debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag); + $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag); } - if ($returnDom->tag == $tag) { + if ($ancestor->tag === $tag) { break; } - $returnDom = $returnDom->parent; + $ancestor = $ancestor->parent; } - return $returnDom; + return $ancestor; } - /** - * Get node's inner text (everything inside the opening and closing tags) - * - * @return string - */ function innertext() { if (isset($this->_[HDOM_INFO_INNER])) { @@ -545,11 +356,6 @@ class simple_html_dom_node return $ret; } - /** - * Get node's outer text (everything including the opening and closing tags) - * - * @return string - */ function outertext() { global $debug_object; @@ -566,9 +372,11 @@ class simple_html_dom_node $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); } - if ($this->tag === 'root') return $this->innertext(); + if ($this->tag === 'root') { + return $this->innertext(); + } - // trigger callback + // todo: What is the use of this callback? Remove? if ($this->dom && $this->dom->callback !== null) { call_user_func_array($this->dom->callback, array($this)); } @@ -581,29 +389,23 @@ class simple_html_dom_node return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); } - // render begin tag + $ret = ''; + if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); - } else { - $ret = ''; } - // render inner text if (isset($this->_[HDOM_INFO_INNER])) { - // If it's a br tag... don't return the HDOM_INNER_INFO that we - // may or may not have added. 
+ // todo:
should either never have HDOM_INFO_INNER or always if ($this->tag !== 'br') { $ret .= $this->_[HDOM_INFO_INNER]; } - } else { - if ($this->nodes) { - foreach ($this->nodes as $n) { - $ret .= $this->convert_text($n->outertext()); - } + } elseif ($this->nodes) { + foreach ($this->nodes as $n) { + $ret .= $this->convert_text($n->outertext()); } } - // render end tag if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { $ret .= 'tag . '>'; } @@ -611,11 +413,6 @@ class simple_html_dom_node return $ret; } - /** - * Get node's plain text (everything excluding all tags) - * - * @return string - */ function text() { if (isset($this->_[HDOM_INFO_INNER])) { @@ -642,7 +439,7 @@ class simple_html_dom_node foreach ($this->nodes as $n) { // Start paragraph after a blank line if ($n->tag === 'p') { - $ret .= "\n\n"; + $ret = trim($ret) . "\n\n"; } $ret .= $this->convert_text($n->text()); @@ -655,14 +452,9 @@ class simple_html_dom_node } } } - return trim($ret); + return $ret; } - /** - * Get node's xml text (inner text as a CDATA section) - * - * @return string - */ function xmltext() { $ret = $this->innertext(); @@ -671,7 +463,6 @@ class simple_html_dom_node return $ret; } - // build node's text with tag function makeup() { // text, comment, unknown @@ -715,18 +506,6 @@ class simple_html_dom_node return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; } - /** - * Find elements by CSS selector - * - * @param string $selector The CSS selector - * @param int|null $idx Index of element to return form the list of matching - * elements (default: `null` = disabled). - * @param bool $lowercase Matches tag names case insensitive (lowercase) if - * enabled (default: `false`) - * @return array|object|null A list of elements matching the specified CSS - * selector or a single element if $idx is specified or null if no element - * was found. 
- */ function find($selector, $idx = null, $lowercase = false) { $selectors = $this->parse_selector($selector); @@ -779,19 +558,6 @@ class simple_html_dom_node return (isset($found[$idx])) ? $found[$idx] : null; } - /** - * Seek DOM elements by selector - * - * **Note** - * The selector element must be compatible to a selector from - * {@see simple_html_dom_node::parse_selector()} - * - * @param array $selector A selector element - * @param array $ret An array of matches - * @param bool $lowercase Matches tag names case insensitive (lowercase) if - * enabled (default: `false`) - * @return void - */ protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) { global $debug_object; @@ -823,7 +589,8 @@ class simple_html_dom_node && $this->parent && in_array($this, $this->parent->children)) { // Next-Sibling Combinator $index = array_search($this, $this->parent->children, true) + 1; - $nodes[] = $this->parent->children[$index]; + if ($index < count($this->parent->children)) + $nodes[] = $this->parent->children[$index]; } elseif ($parent_cmd === '~' && $this->parent && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator @@ -1006,24 +773,6 @@ class simple_html_dom_node } } - /** - * Match value and pattern for a given CSS expression - * - * **Supported Expressions** - * - * | Expression | Description - * | ---------- | ----------- - * | `=` | $value and $pattern must be equal - * | `!=` | $value and $pattern must not be equal - * | `^=` | $value must start with $pattern - * | `$=` | $value must end with $pattern - * | `*=` | $value must contain $pattern - * - * @param string $exp The expression. 
- * @param string $pattern The pattern - * @param string $value The value - * @value bool True if $value matches $pattern - */ protected function match($exp, $pattern, $value, $case_sensitivity) { global $debug_object; @@ -1069,31 +818,6 @@ class simple_html_dom_node return false; } - /** - * Parse CSS selector - * - * @param string $selector_string CSS selector string - * @return array List of CSS selectors. The format depends on the type of - * selector: - * - * ```php - * - * array( // list of selectors (each separated by a comma), i.e. 'img, p, div' - * array( // list of combinator selectors, i.e. 'img > p > div' - * array( // selector element - * [0], // (string) The element tag - * [1], // (string) The element id - * [2], // (array) The element classes - * [3], // (array>) The list of attributes, each - * // with four elements: name, expression, value, inverted - * [4] // (string) The selector combinator (' ' | '>' | '+' | '~') - * ) - * ) - * ) - * ``` - * - * @link https://www.w3.org/TR/selectors/#compound Compound selector - */ protected function parse_selector($selector_string) { global $debug_object; @@ -1185,7 +909,7 @@ class simple_html_dom_node */ if($m[4] !== '') { preg_match_all( - "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is", + "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", trim($m[4]), $attributes, PREG_SET_ORDER @@ -1285,8 +1009,6 @@ class simple_html_dom_node if (isset($this->attr[$name])) { unset($this->attr[$name]); } } - // PaperG - Function to convert the text from one character set to another - // if the two sets are not the same. function convert_text($text) { global $debug_object; @@ -1337,12 +1059,6 @@ class simple_html_dom_node return $converted_text; } - /** - * Returns true if $string is valid UTF-8 and false otherwise. 
- * - * @param mixed $str String to be tested - * @return boolean - */ static function is_utf8($str) { $c = 0; $b = 0; @@ -1370,16 +1086,6 @@ class simple_html_dom_node return true; } - /** - * Function to try a few tricks to determine the displayed size of an img on - * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all - * other tag types. - * - * @author John Schlick - * @version April 19 2012 - * @return array an array containing the 'height' and 'width' of the image - * on the page or -1 if we can't figure it out. - */ function get_display_size() { global $debug_object; @@ -1465,7 +1171,82 @@ class simple_html_dom_node return $result; } - // camel naming conventions + function save($filepath = '') + { + $ret = $this->outertext(); + + if ($filepath !== '') { + file_put_contents($filepath, $ret, LOCK_EX); + } + + return $ret; + } + + function addClass($class) + { + if (is_string($class)) { + $class = explode(' ', $class); + } + + if (is_array($class)) { + foreach($class as $c) { + if (isset($this->class)) { + if ($this->hasClass($c)) { + continue; + } else { + $this->class .= ' ' . 
$c; + } + } else { + $this->class = $c; + } + } + } else { + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); + } + } + } + + function hasClass($class) + { + if (is_string($class)) { + if (isset($this->class)) { + return in_array($class, explode(' ', $this->class), true); + } + } else { + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); + } + } + + return false; + } + + function removeClass($class = null) + { + if (!isset($this->class)) { + return; + } + + if (is_null($class)) { + $this->removeAttribute('class'); + return; + } + + if (is_string($class)) { + $class = explode(' ', $class); + } + + if (is_array($class)) { + $class = array_diff(explode(' ', $this->class), $class); + if (empty($class)) { + $this->removeAttribute('class'); + } else { + $this->class = implode(' ', $class); + } + } + } + function getAllAttributes() { return $this->attr; @@ -1491,6 +1272,44 @@ class simple_html_dom_node $this->__set($name, null); } + function remove() + { + if ($this->parent) { + $this->parent->removeChild($this); + } + } + + function removeChild($node) + { + $nidx = array_search($node, $this->nodes, true); + $cidx = array_search($node, $this->children, true); + $didx = array_search($node, $this->dom->nodes, true); + + if ($nidx !== false && $cidx !== false && $didx !== false) { + + foreach($node->children as $child) { + $node->removeChild($child); + } + + foreach($node->nodes as $entity) { + $enidx = array_search($entity, $node->nodes, true); + $edidx = array_search($entity, $node->dom->nodes, true); + + if ($enidx !== false && $edidx !== false) { + unset($node->nodes[$enidx]); + unset($node->dom->nodes[$edidx]); + } + } + + unset($this->nodes[$nidx]); + unset($this->children[$cidx]); + unset($this->dom->nodes[$didx]); + + $node->clear(); + + } + } + function getElementById($id) { return $this->find("#$id", 0); @@ -1559,170 +1378,34 @@ class simple_html_dom_node } -/** - * 
simple html dom parser - * - * Paperg - in the find routine: allow us to specify that we want case - * insensitive testing of the value of the selector. - * - * Paperg - change $size from protected to public so we can easily access it - * - * Paperg - added ForceTagsClosed in the constructor which tells us whether we - * trust the html or not. Default is to NOT trust it. - * - * @package PlaceLocalInclude - */ class simple_html_dom { - /** - * The root node of the document - * - * @var object - */ public $root = null; - - /** - * List of nodes in the current DOM - * - * @var array - */ public $nodes = array(); - - /** - * Callback function to run for each element in the DOM. - * - * @var callable|null - */ public $callback = null; - - /** - * Indicates how tags and attributes are matched - * - * @var bool When set to **true** tags and attributes will be converted to - * lowercase before matching. - */ public $lowercase = false; - - /** - * Original document size - * - * Holds the original document size. - * - * @var int - */ public $original_size; - - /** - * Current document size - * - * Holds the current document size. The document size is determined by the - * string length of ({@see simple_html_dom::$doc}). 
- * - * _Note_: Using this variable is more efficient than calling `strlen($doc)` - * - * @var int - * */ public $size; - /** - * Current position in the document - * - * @var int - */ protected $pos; - - /** - * The document - * - * @var string - */ protected $doc; - - /** - * Current character - * - * Holds the current character at position {@see simple_html_dom::$pos} in - * the document {@see simple_html_dom::$doc} - * - * _Note_: Using this variable is more efficient than calling - * `substr($doc, $pos, 1)` - * - * @var string - */ protected $char; protected $cursor; - - /** - * Parent node of the next node detected by the parser - * - * @var object - */ protected $parent; protected $noise = array(); - - /** - * Tokens considered blank in HTML - * - * @var string - */ protected $token_blank = " \t\r\n"; - - /** - * Tokens to identify the equal sign for attributes, stopping either at the - * closing tag ("/" i.e. "") or the end of an opening tag (">" i.e. - * "") - * - * @var string - */ protected $token_equal = ' =/>'; - - /** - * Tokens to identify the end of a tag name. A tag name either ends on the - * ending slash ("/" i.e. "") or whitespace ("\s\r\n\t") - * - * @var string - */ protected $token_slash = " />\r\n\t"; - - /** - * Tokens to identify the end of an attribute - * - * @var string - */ protected $token_attr = ' >'; - // Note that this is referenced by a child node, and so it needs to be - // public for that node to see this information. public $_charset = ''; public $_target_charset = ''; - /** - * Innertext for
elements - * - * @var string - */ protected $default_br_text = ''; - /** - * Suffix for elements - * - * @var string - */ public $default_span_text = ''; - /** - * Defines a list of self-closing tags (Void elements) according to the HTML - * Specification - * - * _Remarks_: - * - Use `isset()` instead of `in_array()` on array elements to boost - * performance about 30% - * - Sort elements by name for better readability! - * - * @link https://www.w3.org/TR/html HTML Specification - * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements - */ protected $self_closing_tags = array( 'area' => 1, 'base' => 1, @@ -1739,18 +1422,6 @@ class simple_html_dom 'track' => 1, 'wbr' => 1 ); - - /** - * Defines a list of tags which - if closed - close all optional closing - * elements within if they haven't been closed yet. (So, an element where - * neither opening nor closing tag is omissible consistently closes every - * optional closing element within) - * - * _Remarks_: - * - Use `isset()` instead of `in_array()` on array elements to boost - * performance about 30% - * - Sort elements by name for better readability! - */ protected $block_tags = array( 'body' => 1, 'div' => 1, @@ -1759,62 +1430,6 @@ class simple_html_dom 'span' => 1, 'table' => 1 ); - - /** - * Defines elements whose end tag is omissible. - * - * * key = Name of an element whose end tag is omissible. - * * value = Names of elements whose end tag is omissible, that are closed - * by the current element. - * - * _Remarks_: - * - Use `isset()` instead of `in_array()` on array elements to boost - * performance about 30% - * - Sort elements by name for better readability! - * - * **Example** - * - * An `li` element’s end tag may be omitted if the `li` element is immediately - * followed by another `li` element. To do that, add following element to the - * array: - * - * ```php - * 'li' => array('li'), - * ``` - * - * With this, the following two examples are considered equal. 
Note that the - * second example is missing the closing tags on `li` elements. - * - * ```html - *
  • First Item
  • Second Item
- * ``` - * - *
  • First Item
  • Second Item
- * - * ```html - *
  • First Item
  • Second Item
- * ``` - * - *
  • First Item
  • Second Item
- * - * @var array A two-dimensional array where the key is the name of an - * element whose end tag is omissible and the value is an array of elements - * whose end tag is omissible, that are closed by the current element. - * - * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags - * - * @todo The implementation of optional closing tags doesn't work in all cases - * because it only consideres elements who close other optional closing - * tags, not taking into account that some (non-blocking) tags should close - * these optional closing tags. For example, the end tag for "p" is omissible - * and can be closed by an "address" element, whose end tag is NOT omissible. - * Currently a "p" element without closing tag stops at the next "p" element - * or blocking tag, even if it contains other elements. - * - * @todo Known sourceforge issue #2977341 - * B tags that are not closed cause us to return everything to the end of - * the document. - */ protected $optional_closing_tags = array( // Not optional, see // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element @@ -1873,7 +1488,6 @@ class simple_html_dom $this->clear(); } - // load html from string function load( $str, $lowercase = true, @@ -1928,7 +1542,6 @@ class simple_html_dom return $this; } - // load html from file function load_file() { $args = func_get_args(); @@ -1940,29 +1553,16 @@ class simple_html_dom } } - /** - * Set the callback function - * - * @param callable $function_name Callback function to run for each element - * in the DOM. 
- * @return void - */ function set_callback($function_name) { $this->callback = $function_name; } - /** - * Remove callback function - * - * @return void - */ function remove_callback() { $this->callback = null; } - // save dom as string function save($filepath = '') { $ret = $this->root->innertext(); @@ -1970,18 +1570,18 @@ class simple_html_dom return $ret; } - // find dom node by css selector - // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. function find($selector, $idx = null, $lowercase = false) { return $this->root->find($selector, $idx, $lowercase); } - // clean up memory due to php5 circular references memory leak... function clear() { - foreach ($this->nodes as $n) { - $n->clear(); $n = null; + if (isset($this->nodes)) { + foreach ($this->nodes as $n) { + $n->clear(); + $n = null; + } } // This add next line is documented in the sourceforge repository. @@ -1989,7 +1589,8 @@ class simple_html_dom // use of clear. if (isset($this->children)) { foreach ($this->children as $n) { - $n->clear(); $n = null; + $n->clear(); + $n = null; } } @@ -2012,7 +1613,6 @@ class simple_html_dom $this->root->dump($show_attr); } - // prepare HTML data and init everything protected function prepare( $str, $lowercase = true, $defaultBRText = DEFAULT_BR_TEXT, @@ -2038,11 +1638,6 @@ class simple_html_dom if ($this->size > 0) { $this->char = $this->doc[0]; } } - /** - * Parse HTML content - * - * @return bool True on success - */ protected function parse() { while (true) { @@ -2064,13 +1659,6 @@ class simple_html_dom } } - // PAPERG - dkchou - added this to try to identify the character set of the - // page we have just parsed so we know better how to spit it out later. 
- // NOTE: IF you provide a routine called - // get_last_retrieve_url_contents_content_type which returns the - // CURLINFO_CONTENT_TYPE from the last curl_exec - // (or the content_type header from the last transfer), we will parse THAT, - // and if a charset is specified, we will use it over any other mechanism. protected function parse_charset() { global $debug_object; @@ -2092,6 +1680,7 @@ class simple_html_dom } if (empty($charset)) { + // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); if (!empty($el)) { @@ -2128,53 +1717,77 @@ class simple_html_dom } } - // If we couldn't find a charset above, then lets try to detect one - // based on the text we got... if (empty($charset)) { - // Use this in case mb_detect_charset isn't installed/loaded on - // this machine. - $charset = false; - if (function_exists('mb_detect_encoding')) { - // Have php try to detect the encoding from the text given to us. - $charset = mb_detect_encoding( - $this->doc . 'ascii', - $encoding_list = array( 'UTF-8', 'CP1252' ) - ); - + // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration + if ($meta = $this->root->find('meta[charset]', 0)) { + $charset = $meta->charset; if (is_object($debug_object)) { - $debug_object->debug_log(2, 'mb_detect found: ' . $charset); + $debug_object->debug_log(2, 'meta charset: ' . $charset); } } + } - // and if this doesn't work... then we need to just wrongheadedly - // assume it's UTF-8 so that we can move on - cause this will - // usually give us most of what we need... 
- if ($charset === false) { - if (is_object($debug_object)) { - $debug_object->debug_log( - 2, - 'since mb_detect failed - using default of utf-8' - ); + if (empty($charset)) { + // Try to guess the charset based on the content + // Requires Multibyte String (mbstring) support (optional) + if (function_exists('mb_detect_encoding')) { + /** + * mb_detect_encoding() is not intended to distinguish between + * charsets, especially single-byte charsets. Its primary + * purpose is to detect which multibyte encoding is in use, + * i.e. UTF-8, UTF-16, shift-JIS, etc. + * + * -- https://bugs.php.net/bug.php?id=38138 + * + * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will + * always result in CP1251/ISO-8859-5 and vice versa. + * + * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 + * to stay compatible. + */ + $encoding = mb_detect_encoding( + $this->doc, + array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) + ); + + if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { + // Due to a limitation of mb_detect_encoding + // 'CP1251'/'ISO-8859-5' will be detected as + // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in + // which case we can simply assume it is the other charset. + if (!@iconv('CP1252', 'UTF-8', $this->doc)) { + $encoding = 'CP1251'; + } } - $charset = 'UTF-8'; + if ($encoding !== false) { + $charset = $encoding; + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'mb_detect: ' . $charset); + } + } + } + } + + if (empty($charset)) { + // Assume it's UTF-8 as it is the most likely charset to be used + $charset = 'UTF-8'; + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'No match found, assume ' . $charset); } } // Since CP1252 is a superset, if we get one of it's subsets, we want // it instead. 
- if ((strtolower($charset) == strtolower('ISO-8859-1')) - || (strtolower($charset) == strtolower('Latin1')) - || (strtolower($charset) == strtolower('Latin-1'))) { - + if ((strtolower($charset) == 'iso-8859-1') + || (strtolower($charset) == 'latin1') + || (strtolower($charset) == 'latin-1')) { + $charset = 'CP1252'; if (is_object($debug_object)) { - $debug_object->debug_log( - 2, + $debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset' ); } - - $charset = 'CP1252'; } if (is_object($debug_object)) { @@ -2184,11 +1797,6 @@ class simple_html_dom return $this->_charset = $charset; } - /** - * Parse tag from current document position. - * - * @return bool True if a tag was found, false otherwise - */ protected function read_tag() { // Set end position if no further tags found @@ -2467,63 +2075,50 @@ class simple_html_dom return true; } - /** - * Parse attribute from current document position - * - * @param object $node Node for the attributes - * @param string $name Name of the current attribute - * @param array $space Array for spacing information - * @return void - */ protected function parse_attr($node, $name, &$space) { - // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037 - // If the attribute is already defined inside a tag, only pay attention - // to the first one as opposed to the last one. - // https://stackoverflow.com/a/26341866 - if (isset($node->attr[$name])) { - return; - } + $is_duplicate = isset($node->attr[$name]); - // [2] Whitespace between "=" and the value - $space[2] = $this->copy_skip($this->token_blank); + if (!$is_duplicate) // Copy whitespace between "=" and value + $space[2] = $this->copy_skip($this->token_blank); switch ($this->char) { - case '"': // value is anything between double quotes - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; + case '"': + $quote_type = HDOM_QUOTE_DOUBLE; $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next - $node->attr[$name] = $this->restore_noise($this->copy_until_char('"')); + $value = $this->copy_until_char('"'); $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next break; - case '\'': // value is anything between single quotes - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; + case '\'': + $quote_type = HDOM_QUOTE_SINGLE; $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next - $node->attr[$name] = $this->restore_noise($this->copy_until_char('\'')); + $value = $this->copy_until_char('\''); $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next break; - default: // value is anything until the first space or end tag - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; - $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); + default: + $quote_type = HDOM_QUOTE_NO; + $value = $this->copy_until($this->token_attr); } + + $value = $this->restore_noise($value); + // PaperG: Attributes should not have \r or \n in them, that counts as // html whitespace. - $node->attr[$name] = str_replace("\r", '', $node->attr[$name]); - $node->attr[$name] = str_replace("\n", '', $node->attr[$name]); + $value = str_replace("\r", '', $value); + $value = str_replace("\n", '', $value); + // PaperG: If this is a "class" selector, lets get rid of the preceeding // and trailing space since some people leave it in the multi class case. 
if ($name === 'class') { - $node->attr[$name] = trim($node->attr[$name]); + $value = trim($value); + } + + if (!$is_duplicate) { + $node->_[HDOM_INFO_QUOTE][] = $quote_type; + $node->attr[$name] = $value; } } - /** - * Link node to parent node - * - * @param object $node Node to link to parent - * @param bool $is_child True if the node is a child of parent - * @return void - */ - // link node's parent protected function link_nodes(&$node, $is_child) { $node->parent = $this->parent; @@ -2533,12 +2128,6 @@ class simple_html_dom } } - /** - * Add tag as text node to current node - * - * @param string $tag Tag name - * @return bool True on success - */ protected function as_text_node($tag) { $node = new simple_html_dom_node($this); @@ -2549,28 +2138,12 @@ class simple_html_dom return true; } - /** - * Seek from the current document position to the first occurrence of a - * character not defined by the provided string. Update the current document - * position to the new position. - * - * @param string $chars A string containing every allowed character. - * @return void - */ protected function skip($chars) { $this->pos += strspn($this->doc, $chars, $this->pos); $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next } - /** - * Copy substring from the current document position to the first occurrence - * of a character not defined by the provided string. - * - * @param string $chars A string containing every allowed character. - * @return string Substring from the current document position to the first - * occurrence of a character not defined by the provided string. - */ protected function copy_skip($chars) { $pos = $this->pos; @@ -2581,14 +2154,6 @@ class simple_html_dom return substr($this->doc, $pos, $len); } - /** - * Copy substring from the current document position to the first occurrence - * of any of the provided characters. - * - * @param string $chars A string containing every character to stop at. 
- * @return string Substring from the current document position to the first - * occurrence of any of the provided characters. - */ protected function copy_until($chars) { $pos = $this->pos; @@ -2598,14 +2163,6 @@ class simple_html_dom return substr($this->doc, $pos, $len); } - /** - * Copy substring from the current document position to the first occurrence - * of the provided string. - * - * @param string $char The string to stop at. - * @return string Substring from the current document position to the first - * occurrence of the provided string. - */ protected function copy_until_char($char) { if ($this->char === null) { return ''; } @@ -2625,15 +2182,6 @@ class simple_html_dom return substr($this->doc, $pos_old, $pos - $pos_old); } - /** - * Remove noise from HTML content - * - * Noise is stored to {@see simple_html_dom::$noise} - * - * @param string $pattern The regex pattern used for finding noise - * @param bool $remove_tag True to remove the entire match. Default is false - * to only remove the captured data. - */ protected function remove_noise($pattern, $remove_tag = false) { global $debug_object; @@ -2666,14 +2214,6 @@ class simple_html_dom } } - /** - * Restore noise to HTML content - * - * Noise is restored from {@see simple_html_dom::$noise} - * - * @param string $text A subset of HTML containing noise - * @return string The same content with noise restored - */ function restore_noise($text) { global $debug_object; @@ -2720,7 +2260,6 @@ class simple_html_dom return $text; } - // Sometimes we NEED one of the noise elements. 
function search_noise($text) { global $debug_object; @@ -2754,7 +2293,6 @@ class simple_html_dom } } - // camel naming conventions function childNodes($idx = -1) { return $this->root->childNodes($idx); @@ -2772,7 +2310,7 @@ class simple_html_dom function createElement($name, $value = null) { - return @str_get_html("<$name>$value")->first_child(); + return @str_get_html("<$name>$value")->firstChild(); } function createTextNode($value) From 014b698f6751655d3efe047cb0b951641936b9fe Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 1 Jun 2019 21:05:10 +0200 Subject: [PATCH 12/42] [html] Use find('*') over custom solution find('*') wasn't supported in older versions of simplehtmldom but it is supported now. Thus, all custom implementations can be replaced by the correct solution. --- lib/html.php | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/lib/html.php b/lib/html.php index 0778c640..13db97a4 100644 --- a/lib/html.php +++ b/lib/html.php @@ -32,18 +32,7 @@ function sanitize($html, $htmlContent = str_get_html($html); - /* - * Notice: simple_html_dom currently doesn't support "->find(*)", which is a - * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/ - * - * A solution to this is to find all nodes WITHOUT a specific attribute. If - * the attribute is very unlikely to appear in the DOM, this is essentially - * returning all nodes. - * - * "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib - * "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM. 
- */ - foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) { + foreach($htmlContent->find('*') as $element) { if(in_array($element->tag, $text_to_keep)) { $element->outertext = $element->plaintext; } elseif(in_array($element->tag, $tags_to_remove)) { @@ -90,18 +79,7 @@ function backgroundToImg($htmlContent) { $regex = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/'; $htmlContent = str_get_html($htmlContent); - /* - * Notice: simple_html_dom currently doesn't support "->find(*)", which is a - * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/ - * - * A solution to this is to find all nodes WITHOUT a specific attribute. If - * the attribute is very unlikely to appear in the DOM, this is essentially - * returning all nodes. - * - * "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib - * "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM. - */ - foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) { + foreach($htmlContent->find('*') as $element) { if(preg_match($regex, $element->style, $matches) > 0) { From 052844f5e13c71ceefd743136a71f71226a0eefb Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 1 Jun 2019 21:15:30 +0200 Subject: [PATCH 13/42] all: Use ->remove() instead of ->outertext = '' simplehtmldom 1.9 introduced new functions to recursively remove nodes from the DOM. This allows removing elements without the need to re-load the document by using $html->load($html->save()), which is very inefficient. 
Find more information about remove() at https://simplehtmldom.sourceforge.io/docs/1.9/api/simple_html_dom_node/remove/ --- bridges/AsahiShimbunAJWBridge.php | 6 +++--- bridges/BundesbankBridge.php | 2 +- bridges/CastorusBridge.php | 2 +- bridges/DauphineLibereBridge.php | 2 +- bridges/EconomistBridge.php | 6 +++--- bridges/FacebookBridge.php | 2 +- bridges/HaveIBeenPwnedBridge.php | 2 +- bridges/JustETFBridge.php | 8 ++++---- bridges/NextgovBridge.php | 2 +- bridges/OsmAndBlogBridge.php | 2 +- bridges/PikabuBridge.php | 2 +- bridges/RadioMelodieBridge.php | 10 +++------- bridges/SIMARBridge.php | 2 +- bridges/ScmbBridge.php | 2 +- bridges/TwitterBridge.php | 2 +- bridges/VkBridge.php | 27 +++++++++++++-------------- bridges/WikipediaBridge.php | 6 +++--- bridges/WordPressBridge.php | 2 +- bridges/WorldOfTanksBridge.php | 2 +- bridges/XenForoBridge.php | 2 +- lib/html.php | 2 +- 21 files changed, 44 insertions(+), 49 deletions(-) diff --git a/bridges/AsahiShimbunAJWBridge.php b/bridges/AsahiShimbunAJWBridge.php index 0ceb0381..62b9739d 100644 --- a/bridges/AsahiShimbunAJWBridge.php +++ b/bridges/AsahiShimbunAJWBridge.php @@ -50,18 +50,18 @@ class AsahiShimbunAJWBridge extends BridgeAbstract { $e_lead = $element->find('span.Lead', 0); if ($e_lead) { $item['content'] = $e_lead->innertext; - $e_lead->outertext = ''; + $e_lead->remove(); } else { $item['content'] = $element->innertext; } $e_date = $element->find('span.EnDate', 0); if ($e_date) { $item['timestamp'] = strtotime($e_date->innertext); - $e_date->outertext = ''; + $e_date->remove(); } $e_video = $element->find('span.EnVideo', 0); if ($e_video) { - $e_video->outertext = ''; + $e_video->remove(); $element->innertext = "VIDEO: $element->innertext"; } $item['title'] = $element->innertext; diff --git a/bridges/BundesbankBridge.php b/bridges/BundesbankBridge.php index b64a6425..d78873c6 100644 --- a/bridges/BundesbankBridge.php +++ b/bridges/BundesbankBridge.php @@ -55,7 +55,7 @@ class BundesbankBridge extends 
BridgeAbstract { $title = $study->find('.teasable__title div.h2', 0); foreach($title->children as &$child) { - $child->outertext = ''; + $child->remove(); } $item['title'] = $title->innertext; diff --git a/bridges/CastorusBridge.php b/bridges/CastorusBridge.php index 3ed1331e..48af9696 100644 --- a/bridges/CastorusBridge.php +++ b/bridges/CastorusBridge.php @@ -58,7 +58,7 @@ class CastorusBridge extends BridgeAbstract { returnServerError('Cannot find nodes!'); foreach($nodes as $node) { - $node->outertext = ''; + $node->remove(); } return strtotime($activity->innertext); diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index 20c82070..1ff25106 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -50,7 +50,7 @@ class DauphineLibereBridge extends FeedExpander { private function extractContent($url){ $html2 = getSimpleHTMLDOMCached($url); foreach ($html2->find('.noprint, link, script, iframe, .shareTool, .contentInfo') as $remove) { - $remove->outertext = ''; + $remove->remove(); } return $html2->find('div.content', 0)->innertext; } diff --git a/bridges/EconomistBridge.php b/bridges/EconomistBridge.php index 1256be45..19b2a832 100644 --- a/bridges/EconomistBridge.php +++ b/bridges/EconomistBridge.php @@ -29,16 +29,16 @@ class EconomistBridge extends BridgeAbstract { // Remove newsletter subscription box $newsletter = $content->find('div[class="newsletter-form__message"]', 0); if ($newsletter) - $newsletter->outertext = ''; + $newsletter->remove(); $newsletterForm = $content->find('form', 0); if ($newsletterForm) - $newsletterForm->outertext = ''; + $newsletterForm->remove(); // Remove next and previous article URLs at the bottom $nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0); if ($nextprev) - $nextprev->outertext = ''; + $nextprev->remove(); $section = [ $article->find('h3[itemprop="articleSection"]', 0)->plaintext ]; diff --git a/bridges/FacebookBridge.php 
b/bridges/FacebookBridge.php index c0901072..a0331da9 100644 --- a/bridges/FacebookBridge.php +++ b/bridges/FacebookBridge.php @@ -584,7 +584,7 @@ EOD; foreach($content_filters as $filter) { foreach($content->find($filter) as $subject) { - $subject->outertext = ''; + $subject->remove(); } } diff --git a/bridges/HaveIBeenPwnedBridge.php b/bridges/HaveIBeenPwnedBridge.php index f256623a..8fac1e33 100644 --- a/bridges/HaveIBeenPwnedBridge.php +++ b/bridges/HaveIBeenPwnedBridge.php @@ -50,7 +50,7 @@ class HaveIBeenPwnedBridge extends BridgeAbstract { $permalink = $breach->find('p', 1)->find('a', 0)->href; // Remove permalink - $breach->find('p', 1)->find('a', 0)->outertext = ''; + $breach->find('p', 1)->find('a', 0)->remove(); $item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . ' breached accounts'; $item['dateAdded'] = strtotime($dateAdded[1]); diff --git a/bridges/JustETFBridge.php b/bridges/JustETFBridge.php index 8d5b3d5a..c9201e4b 100644 --- a/bridges/JustETFBridge.php +++ b/bridges/JustETFBridge.php @@ -239,16 +239,16 @@ class JustETFBridge extends BridgeAbstract { or returnServerError('Article body not found!'); // Remove teaser image - $element->find('img.teaser-img', 0)->outertext = ''; + $element->find('img.teaser-img', 0)->remove(); // Remove self advertisements foreach($element->find('.call-action') as $adv) { - $adv->outertext = ''; + $adv->remove(); } // Remove tips foreach($element->find('.panel-edu') as $tip) { - $tip->outertext = ''; + $tip->remove(); } // Remove inline scripts (used for i.e. 
interactive graphs) as they are @@ -318,7 +318,7 @@ class JustETFBridge extends BridgeAbstract { $description = $description->parent(); foreach($description->find('div') as $div) { - $div->outertext = ''; + $div->remove(); } $quote = $html->find('div.infobox div.val', 0) diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index 74bfc54a..5e393457 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -61,7 +61,7 @@ class NextgovBridge extends FeedExpander { return 'Could not request Nextgov: ' . $url; $contents = $article->find('div.wysiwyg', 0); - $contents->find('svg.content-tombstone', 0)->outertext = ''; + $contents->find('svg.content-tombstone', 0)->remove(); $contents = $contents->innertext; $contents = stripWithDelimiters($contents, '
', '
'); $contents = stripWithDelimiters($contents, ''); //ad outer div diff --git a/bridges/OsmAndBlogBridge.php b/bridges/OsmAndBlogBridge.php index 402c0301..25e765f5 100644 --- a/bridges/OsmAndBlogBridge.php +++ b/bridges/OsmAndBlogBridge.php @@ -51,7 +51,7 @@ class OsmAndBlogBridge extends BridgeAbstract { private function cleanupContent($content, ...$removeItems) { foreach ($removeItems as $obj) { - if ($obj) $obj->outertext = ''; + if ($obj) $obj->remove(); } foreach ($content->find('img') as $obj) { $obj->src = $this->filterURL($obj->src); diff --git a/bridges/PikabuBridge.php b/bridges/PikabuBridge.php index af603aca..1e1d5c8e 100644 --- a/bridges/PikabuBridge.php +++ b/bridges/PikabuBridge.php @@ -63,7 +63,7 @@ class PikabuBridge extends BridgeAbstract { foreach($el_to_remove_selectors as $el_to_remove_selector) { foreach($post->find($el_to_remove_selector) as $el) { - $el->outertext = ''; + $el->remove(); } } diff --git a/bridges/RadioMelodieBridge.php b/bridges/RadioMelodieBridge.php index fb5aca6e..8e2cf05d 100644 --- a/bridges/RadioMelodieBridge.php +++ b/bridges/RadioMelodieBridge.php @@ -38,20 +38,17 @@ class RadioMelodieBridge extends BridgeAbstract { $imgs = $textDOM->find('img[src^="http://www.radiomelodie.com/image.php]'); foreach($imgs as $img) { $img->src = $this->rewriteImage($img->src); - $article->save(); } // Remove Google Ads $ads = $article->find('div[class=adInline]'); foreach($ads as $ad) { - $ad->outertext = ''; - $article->save(); + $ad->remove(); } // Remove Radio Melodie Logo $logoHTML = $article->find('div[id=logoArticleRM]', 0); - $logoHTML->outertext = ''; - $article->save(); + $logoHTML->remove(); $author = $article->find('p[class=AuthorName]', 0)->plaintext; @@ -65,8 +62,7 @@ class RadioMelodieBridge extends BridgeAbstract { $header = ''; // Remove the Date and Author part - $textDOM->find('div[class=AuthorDate]', 0)->outertext = ''; - $article->save(); + $textDOM->find('div[class=AuthorDate]', 0)->remove(); $text = 
$textDOM->innertext; $item['content'] = '

' . $item['title'] . '

' . $date . '
' . $header . $text; $this->items[] = $item; diff --git a/bridges/SIMARBridge.php b/bridges/SIMARBridge.php index 1e446cf5..41d517b4 100644 --- a/bridges/SIMARBridge.php +++ b/bridges/SIMARBridge.php @@ -48,7 +48,7 @@ class SIMARBridge extends BridgeAbstract { foreach($e_item->find('p') as $paragraph) { /* Remove empty paragraphs */ if (preg_match('/^(\W| )+$/', $paragraph->innertext) == 1) { - $paragraph->outertext = ''; + $paragraph->remove(); } } if ($e_item) { diff --git a/bridges/ScmbBridge.php b/bridges/ScmbBridge.php index 2107aa3d..65fbbf01 100644 --- a/bridges/ScmbBridge.php +++ b/bridges/ScmbBridge.php @@ -18,7 +18,7 @@ class ScmbBridge extends BridgeAbstract { $item['title'] = $article->find('header h1 a', 0)->innertext; // remove text "En savoir plus" from anecdote content - $article->find('span.read-more', 0)->outertext = ''; + $article->find('span.read-more', 0)->remove(); $content = $article->find('p.summary a', 0)->innertext; // remove superfluous spaces at the end diff --git a/bridges/TwitterBridge.php b/bridges/TwitterBridge.php index b3b7bed4..f3ba39c1 100644 --- a/bridges/TwitterBridge.php +++ b/bridges/TwitterBridge.php @@ -171,7 +171,7 @@ class TwitterBridge extends BridgeAbstract { // remove 'invisible' content foreach($tweet->find('.invisible') as $invisible) { - $invisible->outertext = ''; + $invisible->remove(); } // Skip protmoted tweets diff --git a/bridges/VkBridge.php b/bridges/VkBridge.php index 8653e7c9..5274180f 100644 --- a/bridges/VkBridge.php +++ b/bridges/VkBridge.php @@ -62,9 +62,8 @@ class VkBridge extends BridgeAbstract $this->pageName = htmlspecialchars_decode($pageName); } foreach ($html->find('div.replies') as $comment_block) { - $comment_block->outertext = ''; + $comment_block->remove(); } - $html->load($html->save()); $pinned_post_item = null; $last_post_id = 0; @@ -82,7 +81,7 @@ class VkBridge extends BridgeAbstract if (is_object($post->find('a.wall_post_more', 0))) { //delete link "show full" in content - 
$post->find('a.wall_post_more', 0)->outertext = ''; + $post->find('a.wall_post_more', 0)->remove(); } $content_suffix = ''; @@ -114,7 +113,7 @@ class VkBridge extends BridgeAbstract foreach($external_link_selectors_to_remove as $sel) { if (is_object($post->find($sel, 0))) { - $post->find($sel, 0)->outertext = ''; + $post->find($sel, 0)->remove(); } } @@ -140,7 +139,7 @@ class VkBridge extends BridgeAbstract $content_suffix .= "
"; } $content_suffix .= "
Article: $article_title ($article_author)"; - $article->outertext = ''; + $article->remove(); } // get video on post @@ -150,7 +149,7 @@ class VkBridge extends BridgeAbstract $video_title = $video->find('div.post_video_title', 0)->plaintext; $video_link = $video->find('a.lnk', 0)->getAttribute('href'); $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); - $video->outertext = ''; + $video->remove(); $main_video_link = $video_link; } @@ -161,14 +160,14 @@ class VkBridge extends BridgeAbstract if (count($temp) > 1) $video_title = $temp[1]; $video_link = $a->getAttribute('href'); if ($video_link != $main_video_link) $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); - $a->outertext = ''; + $a->remove(); } // get all photos foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) { $result = $this->getPhoto($a); if ($result == null) continue; - $a->outertext = ''; + $a->remove(); $content_suffix .= "
$result"; } @@ -177,7 +176,7 @@ class VkBridge extends BridgeAbstract $a = $el->find('.page_album_link', 0); $album_title = $a->find('.page_album_title_text', 0)->getAttribute('title'); $album_link = $a->getAttribute('href'); - $el->outertext = ''; + $el->remove(); $content_suffix .= "
Album: $album_title"; } @@ -200,7 +199,7 @@ class VkBridge extends BridgeAbstract } - $a->outertext = ''; + $a->remove(); } // get other documents @@ -217,7 +216,7 @@ class VkBridge extends BridgeAbstract } - $div->outertext = ''; + $div->remove(); } // get polls @@ -227,14 +226,14 @@ class VkBridge extends BridgeAbstract foreach($div->find('div.page_poll_text') as $poll_stat_title) { $content_suffix .= '
- ' . $poll_stat_title->innertext; } - $div->outertext = ''; + $div->remove(); } // get sign $post_author = $pageName; foreach($post->find('a.wall_signed_by') as $a) { $post_author = $a->innertext; - $a->outertext = ''; + $a->remove(); } if (is_object($post->find('div.copy_quote', 0))) { @@ -243,7 +242,7 @@ class VkBridge extends BridgeAbstract } $copy_quote = $post->find('div.copy_quote', 0); if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) { - $copy_post_header->outertext = ''; + $copy_post_header->remove(); } $copy_quote_content = $copy_quote->innertext; $copy_quote->outertext = "
Reposted:
$copy_quote_content"; diff --git a/bridges/WikipediaBridge.php b/bridges/WikipediaBridge.php index 7ca763fc..a53652dd 100644 --- a/bridges/WikipediaBridge.php +++ b/bridges/WikipediaBridge.php @@ -141,7 +141,7 @@ class WikipediaBridge extends BridgeAbstract { $anchorFallbackIndex = 0){ // Clean the bottom of the featured article if ($element->find('div', -1)) - $element->find('div', -1)->outertext = ''; + $element->find('div', -1)->remove(); // The title and URI of the article can be found in an anchor containing // the string '...' in most wikis ('full article ...') @@ -202,10 +202,10 @@ class WikipediaBridge extends BridgeAbstract { // Let's remove a couple of things from the article $table = $content->find('#toc', 0); // Table of contents if(!$table === false) - $table->outertext = ''; + $table->remove(); foreach($content->find('ol.references') as $reference) // References - $reference->outertext = ''; + $reference->remove(); return str_replace('href="/', 'href="' . $this->getURI() . 
'/', $content->innertext); } diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 1589c723..18045559 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -50,7 +50,7 @@ class WordPressBridge extends FeedExpander { foreach ($article->find('h1.entry-title') as $title) if ($title->plaintext == $item['title']) - $title->outertext = ''; + $title->remove(); $article_image = $article_html->find('img.wp-post-image', 0); if(!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) { diff --git a/bridges/WorldOfTanksBridge.php b/bridges/WorldOfTanksBridge.php index 46dd588d..a5fa0446 100644 --- a/bridges/WorldOfTanksBridge.php +++ b/bridges/WorldOfTanksBridge.php @@ -44,7 +44,7 @@ class WorldOfTanksBridge extends FeedExpander { // Remove the scripts, please foreach($content->find('script') as $script) { - $script->outertext = ''; + $script->remove(); } return $content->innertext; diff --git a/bridges/XenForoBridge.php b/bridges/XenForoBridge.php index 7bf1f15d..dc3a1a5e 100644 --- a/bridges/XenForoBridge.php +++ b/bridges/XenForoBridge.php @@ -193,7 +193,7 @@ class XenForoBridge extends BridgeAbstract { // Remove script tags foreach($content->find('script') as $script) { - $script->outertext = ''; + $script->remove(); } $item['content'] = $content->innertext; diff --git a/lib/html.php b/lib/html.php index 13db97a4..49c77f04 100644 --- a/lib/html.php +++ b/lib/html.php @@ -36,7 +36,7 @@ function sanitize($html, if(in_array($element->tag, $text_to_keep)) { $element->outertext = $element->plaintext; } elseif(in_array($element->tag, $tags_to_remove)) { - $element->outertext = ''; + $element->remove(); } else { foreach($element->getAllAttributes() as $attributeName => $attribute) { if(!in_array($attributeName, $attributes_to_keep)) From 15c374e317d2c02d92acb863d5611121a1399318 Mon Sep 17 00:00:00 2001 From: Eugene Molotov Date: Sun, 2 Jun 2019 00:35:18 +0500 Subject: [PATCH 14/42] [PikabuBridge] More 
options and fixes (#1149) * Add gif support * Use page title as feed title * Implement community support --- bridges/PikabuBridge.php | 49 ++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/bridges/PikabuBridge.php b/bridges/PikabuBridge.php index 1e1d5c8e..8573e6b6 100644 --- a/bridges/PikabuBridge.php +++ b/bridges/PikabuBridge.php @@ -6,6 +6,16 @@ class PikabuBridge extends BridgeAbstract { const DESCRIPTION = 'Выводит посты по тегу'; const MAINTAINER = 'em92'; + const PARAMETERS_FILTER = array( + 'name' => 'Фильтр', + 'type' => 'list', + 'values' => array( + 'Горячее' => 'hot', + 'Свежее' => 'new', + ), + 'defaultValue' => 'hot' + ); + const PARAMETERS = array( 'По тегу' => array( 'tag' => array( @@ -13,21 +23,29 @@ class PikabuBridge extends BridgeAbstract { 'exampleValue' => 'it', 'required' => true ), - 'filter' => array( - 'name' => 'Фильтр', - 'type' => 'list', - 'values' => array( - 'Горячее' => 'hot', - 'Свежее' => 'new', - ), - 'defaultValue' => 'hot' - ) + 'filter' => self::PARAMETERS_FILTER + ), + 'По сообществу' => array( + 'community' => array( + 'name' => 'Сообщество', + 'exampleValue' => 'linux', + 'required' => true + ), + 'filter' => self::PARAMETERS_FILTER ) ); + protected $title = null; + public function getURI() { if ($this->getInput('tag')) { return self::URI . '/tag/' . rawurlencode($this->getInput('tag')) . '/' . rawurlencode($this->getInput('filter')); + } else if ($this->getInput('community')) { + $uri = self::URI . '/community/' . rawurlencode($this->getInput('community')); + if ($this->getInput('filter') != 'hot') { + $uri .= '/' . rawurlencode($this->getInput('filter')); + } + return $uri; } else { return parent::getURI(); } @@ -38,10 +56,10 @@ class PikabuBridge extends BridgeAbstract { } public function getName() { - if (is_string($this->getInput('tag'))) { - return $this->getInput('tag') . ' - ' . 
parent::getName(); - } else { + if (is_null($this->title)) { return parent::getName(); + } else { + return $this->title . ' - ' . parent::getName(); } } @@ -52,6 +70,8 @@ class PikabuBridge extends BridgeAbstract { $text_html = iconv('windows-1251', 'utf-8', $text_html); $html = str_get_html($text_html); + $this->title = $html->find('title', 0)->innertext; + foreach($html->find('article.story') as $post) { $time = $post->find('time.story__datetime', 0); if (is_null($time)) continue; @@ -67,6 +87,11 @@ class PikabuBridge extends BridgeAbstract { } } + foreach($post->find('[data-type=gifx]') as $el) { + $src = $el->getAttribute('data-source'); + $el->outertext = ''; + } + foreach($post->find('img') as $img) { $src = $img->getAttribute('src'); if (!$src) { From 82a9bb5b1ccff958902b0aea09af852e1e53805d Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Sat, 1 Jun 2019 22:22:05 +0200 Subject: [PATCH 15/42] [.github] Update issue template for bridge requests * Automatically label bridge requests * Propose default title for new bridge requests --- .github/ISSUE_TEMPLATE/bridge-request-template.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bridge-request-template.md b/.github/ISSUE_TEMPLATE/bridge-request-template.md index f4b1119f..344805ce 100644 --- a/.github/ISSUE_TEMPLATE/bridge-request-template.md +++ b/.github/ISSUE_TEMPLATE/bridge-request-template.md @@ -1,6 +1,9 @@ --- name: Bridge request template about: Use this template for requesting a new bridge +title: Bridge request for ... +labels: Bridge-Request +assignees: '' --- From ed539bacf95a71164f24921d75eaf1ec248ee4d4 Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Sat, 1 Jun 2019 22:35:33 +0200 Subject: [PATCH 16/42] Add issue template for generic bug reports This commit adds a new template for generic bug reports based on the standard template provided by GitHub. 
--- .github/ISSUE_TEMPLATE/bug_report.md | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..70d49ffb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,38 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: Bug-Report +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. From 468d8be72d34f0a94e5ae4090b3321c5bda4db35 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 1 Jun 2019 22:32:41 +0200 Subject: [PATCH 17/42] [Exceptions] Fix GitHub query labels for bug reports All bug reports now use the Bridge-Broken label by default --- lib/Exceptions.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Exceptions.php b/lib/Exceptions.php index ac452d02..112580de 100644 --- a/lib/Exceptions.php +++ b/lib/Exceptions.php @@ -83,7 +83,7 @@ function buildBridgeException($e, $bridge){ . 
'`'; $body_html = nl2br($body); - $link = buildGitHubIssueQuery($title, $body, 'bug report', $bridge->getMaintainer()); + $link = buildGitHubIssueQuery($title, $body, 'Bridge-Broken', $bridge->getMaintainer()); $header = buildHeader($e, $bridge); $message = <<getMaintainer()); + $link = buildGitHubIssueQuery($title, $body, 'Bridge-Broken', $bridge->getMaintainer()); $header = buildHeader($e, $bridge); $message = "RSS-Bridge was unable to transform the contents returned by {$bridge->getName()}!"; From 6c4098d6558c33a5fcb2a8bc9fb29e915d56fc6c Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 2 Jun 2019 13:03:26 +0200 Subject: [PATCH 18/42] Revert "all: Use ->remove() instead of ->outertext = ''" This reverts commit 052844f5e13c71ceefd743136a71f71226a0eefb. There is a bug in ->remove() that causes the parser to incorrectly identify elements in the DOM tree that shouldn't exist anymore. References #1151 --- bridges/AsahiShimbunAJWBridge.php | 6 +++--- bridges/BundesbankBridge.php | 2 +- bridges/CastorusBridge.php | 2 +- bridges/DauphineLibereBridge.php | 2 +- bridges/EconomistBridge.php | 6 +++--- bridges/FacebookBridge.php | 2 +- bridges/HaveIBeenPwnedBridge.php | 2 +- bridges/JustETFBridge.php | 8 ++++---- bridges/NextgovBridge.php | 2 +- bridges/OsmAndBlogBridge.php | 2 +- bridges/PikabuBridge.php | 2 +- bridges/RadioMelodieBridge.php | 10 +++++++--- bridges/SIMARBridge.php | 2 +- bridges/ScmbBridge.php | 2 +- bridges/TwitterBridge.php | 2 +- bridges/VkBridge.php | 27 ++++++++++++++------------- bridges/WikipediaBridge.php | 6 +++--- bridges/WordPressBridge.php | 2 +- bridges/WorldOfTanksBridge.php | 2 +- bridges/XenForoBridge.php | 2 +- lib/html.php | 2 +- 21 files changed, 49 insertions(+), 44 deletions(-) diff --git a/bridges/AsahiShimbunAJWBridge.php b/bridges/AsahiShimbunAJWBridge.php index 62b9739d..0ceb0381 100644 --- a/bridges/AsahiShimbunAJWBridge.php +++ b/bridges/AsahiShimbunAJWBridge.php @@ -50,18 +50,18 @@ class AsahiShimbunAJWBridge extends 
BridgeAbstract { $e_lead = $element->find('span.Lead', 0); if ($e_lead) { $item['content'] = $e_lead->innertext; - $e_lead->remove(); + $e_lead->outertext = ''; } else { $item['content'] = $element->innertext; } $e_date = $element->find('span.EnDate', 0); if ($e_date) { $item['timestamp'] = strtotime($e_date->innertext); - $e_date->remove(); + $e_date->outertext = ''; } $e_video = $element->find('span.EnVideo', 0); if ($e_video) { - $e_video->remove(); + $e_video->outertext = ''; $element->innertext = "VIDEO: $element->innertext"; } $item['title'] = $element->innertext; diff --git a/bridges/BundesbankBridge.php b/bridges/BundesbankBridge.php index d78873c6..b64a6425 100644 --- a/bridges/BundesbankBridge.php +++ b/bridges/BundesbankBridge.php @@ -55,7 +55,7 @@ class BundesbankBridge extends BridgeAbstract { $title = $study->find('.teasable__title div.h2', 0); foreach($title->children as &$child) { - $child->remove(); + $child->outertext = ''; } $item['title'] = $title->innertext; diff --git a/bridges/CastorusBridge.php b/bridges/CastorusBridge.php index 48af9696..3ed1331e 100644 --- a/bridges/CastorusBridge.php +++ b/bridges/CastorusBridge.php @@ -58,7 +58,7 @@ class CastorusBridge extends BridgeAbstract { returnServerError('Cannot find nodes!'); foreach($nodes as $node) { - $node->remove(); + $node->outertext = ''; } return strtotime($activity->innertext); diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index 1ff25106..20c82070 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -50,7 +50,7 @@ class DauphineLibereBridge extends FeedExpander { private function extractContent($url){ $html2 = getSimpleHTMLDOMCached($url); foreach ($html2->find('.noprint, link, script, iframe, .shareTool, .contentInfo') as $remove) { - $remove->remove(); + $remove->outertext = ''; } return $html2->find('div.content', 0)->innertext; } diff --git a/bridges/EconomistBridge.php b/bridges/EconomistBridge.php index 
19b2a832..1256be45 100644 --- a/bridges/EconomistBridge.php +++ b/bridges/EconomistBridge.php @@ -29,16 +29,16 @@ class EconomistBridge extends BridgeAbstract { // Remove newsletter subscription box $newsletter = $content->find('div[class="newsletter-form__message"]', 0); if ($newsletter) - $newsletter->remove(); + $newsletter->outertext = ''; $newsletterForm = $content->find('form', 0); if ($newsletterForm) - $newsletterForm->remove(); + $newsletterForm->outertext = ''; // Remove next and previous article URLs at the bottom $nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0); if ($nextprev) - $nextprev->remove(); + $nextprev->outertext = ''; $section = [ $article->find('h3[itemprop="articleSection"]', 0)->plaintext ]; diff --git a/bridges/FacebookBridge.php b/bridges/FacebookBridge.php index a0331da9..c0901072 100644 --- a/bridges/FacebookBridge.php +++ b/bridges/FacebookBridge.php @@ -584,7 +584,7 @@ EOD; foreach($content_filters as $filter) { foreach($content->find($filter) as $subject) { - $subject->remove(); + $subject->outertext = ''; } } diff --git a/bridges/HaveIBeenPwnedBridge.php b/bridges/HaveIBeenPwnedBridge.php index 8fac1e33..f256623a 100644 --- a/bridges/HaveIBeenPwnedBridge.php +++ b/bridges/HaveIBeenPwnedBridge.php @@ -50,7 +50,7 @@ class HaveIBeenPwnedBridge extends BridgeAbstract { $permalink = $breach->find('p', 1)->find('a', 0)->href; // Remove permalink - $breach->find('p', 1)->find('a', 0)->remove(); + $breach->find('p', 1)->find('a', 0)->outertext = ''; $item['title'] = $breach->find('h3', 0)->plaintext . ' - ' . $accounts[1] . 
' breached accounts'; $item['dateAdded'] = strtotime($dateAdded[1]); diff --git a/bridges/JustETFBridge.php b/bridges/JustETFBridge.php index c9201e4b..8d5b3d5a 100644 --- a/bridges/JustETFBridge.php +++ b/bridges/JustETFBridge.php @@ -239,16 +239,16 @@ class JustETFBridge extends BridgeAbstract { or returnServerError('Article body not found!'); // Remove teaser image - $element->find('img.teaser-img', 0)->remove(); + $element->find('img.teaser-img', 0)->outertext = ''; // Remove self advertisements foreach($element->find('.call-action') as $adv) { - $adv->remove(); + $adv->outertext = ''; } // Remove tips foreach($element->find('.panel-edu') as $tip) { - $tip->remove(); + $tip->outertext = ''; } // Remove inline scripts (used for i.e. interactive graphs) as they are @@ -318,7 +318,7 @@ class JustETFBridge extends BridgeAbstract { $description = $description->parent(); foreach($description->find('div') as $div) { - $div->remove(); + $div->outertext = ''; } $quote = $html->find('div.infobox div.val', 0) diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index 5e393457..74bfc54a 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -61,7 +61,7 @@ class NextgovBridge extends FeedExpander { return 'Could not request Nextgov: ' . $url; $contents = $article->find('div.wysiwyg', 0); - $contents->find('svg.content-tombstone', 0)->remove(); + $contents->find('svg.content-tombstone', 0)->outertext = ''; $contents = $contents->innertext; $contents = stripWithDelimiters($contents, '
', '
'); $contents = stripWithDelimiters($contents, ''); //ad outer div diff --git a/bridges/OsmAndBlogBridge.php b/bridges/OsmAndBlogBridge.php index 25e765f5..402c0301 100644 --- a/bridges/OsmAndBlogBridge.php +++ b/bridges/OsmAndBlogBridge.php @@ -51,7 +51,7 @@ class OsmAndBlogBridge extends BridgeAbstract { private function cleanupContent($content, ...$removeItems) { foreach ($removeItems as $obj) { - if ($obj) $obj->remove(); + if ($obj) $obj->outertext = ''; } foreach ($content->find('img') as $obj) { $obj->src = $this->filterURL($obj->src); diff --git a/bridges/PikabuBridge.php b/bridges/PikabuBridge.php index 8573e6b6..362b87dc 100644 --- a/bridges/PikabuBridge.php +++ b/bridges/PikabuBridge.php @@ -83,7 +83,7 @@ class PikabuBridge extends BridgeAbstract { foreach($el_to_remove_selectors as $el_to_remove_selector) { foreach($post->find($el_to_remove_selector) as $el) { - $el->remove(); + $el->outertext = ''; } } diff --git a/bridges/RadioMelodieBridge.php b/bridges/RadioMelodieBridge.php index 8e2cf05d..fb5aca6e 100644 --- a/bridges/RadioMelodieBridge.php +++ b/bridges/RadioMelodieBridge.php @@ -38,17 +38,20 @@ class RadioMelodieBridge extends BridgeAbstract { $imgs = $textDOM->find('img[src^="http://www.radiomelodie.com/image.php]'); foreach($imgs as $img) { $img->src = $this->rewriteImage($img->src); + $article->save(); } // Remove Google Ads $ads = $article->find('div[class=adInline]'); foreach($ads as $ad) { - $ad->remove(); + $ad->outertext = ''; + $article->save(); } // Remove Radio Melodie Logo $logoHTML = $article->find('div[id=logoArticleRM]', 0); - $logoHTML->remove(); + $logoHTML->outertext = ''; + $article->save(); $author = $article->find('p[class=AuthorName]', 0)->plaintext; @@ -62,7 +65,8 @@ class RadioMelodieBridge extends BridgeAbstract { $header = ''; // Remove the Date and Author part - $textDOM->find('div[class=AuthorDate]', 0)->remove(); + $textDOM->find('div[class=AuthorDate]', 0)->outertext = ''; + $article->save(); $text = 
$textDOM->innertext; $item['content'] = '

' . $item['title'] . '

' . $date . '
' . $header . $text; $this->items[] = $item; diff --git a/bridges/SIMARBridge.php b/bridges/SIMARBridge.php index 41d517b4..1e446cf5 100644 --- a/bridges/SIMARBridge.php +++ b/bridges/SIMARBridge.php @@ -48,7 +48,7 @@ class SIMARBridge extends BridgeAbstract { foreach($e_item->find('p') as $paragraph) { /* Remove empty paragraphs */ if (preg_match('/^(\W| )+$/', $paragraph->innertext) == 1) { - $paragraph->remove(); + $paragraph->outertext = ''; } } if ($e_item) { diff --git a/bridges/ScmbBridge.php b/bridges/ScmbBridge.php index 65fbbf01..2107aa3d 100644 --- a/bridges/ScmbBridge.php +++ b/bridges/ScmbBridge.php @@ -18,7 +18,7 @@ class ScmbBridge extends BridgeAbstract { $item['title'] = $article->find('header h1 a', 0)->innertext; // remove text "En savoir plus" from anecdote content - $article->find('span.read-more', 0)->remove(); + $article->find('span.read-more', 0)->outertext = ''; $content = $article->find('p.summary a', 0)->innertext; // remove superfluous spaces at the end diff --git a/bridges/TwitterBridge.php b/bridges/TwitterBridge.php index f3ba39c1..b3b7bed4 100644 --- a/bridges/TwitterBridge.php +++ b/bridges/TwitterBridge.php @@ -171,7 +171,7 @@ class TwitterBridge extends BridgeAbstract { // remove 'invisible' content foreach($tweet->find('.invisible') as $invisible) { - $invisible->remove(); + $invisible->outertext = ''; } // Skip protmoted tweets diff --git a/bridges/VkBridge.php b/bridges/VkBridge.php index 5274180f..8653e7c9 100644 --- a/bridges/VkBridge.php +++ b/bridges/VkBridge.php @@ -62,8 +62,9 @@ class VkBridge extends BridgeAbstract $this->pageName = htmlspecialchars_decode($pageName); } foreach ($html->find('div.replies') as $comment_block) { - $comment_block->remove(); + $comment_block->outertext = ''; } + $html->load($html->save()); $pinned_post_item = null; $last_post_id = 0; @@ -81,7 +82,7 @@ class VkBridge extends BridgeAbstract if (is_object($post->find('a.wall_post_more', 0))) { //delete link "show full" in content - 
$post->find('a.wall_post_more', 0)->remove(); + $post->find('a.wall_post_more', 0)->outertext = ''; } $content_suffix = ''; @@ -113,7 +114,7 @@ class VkBridge extends BridgeAbstract foreach($external_link_selectors_to_remove as $sel) { if (is_object($post->find($sel, 0))) { - $post->find($sel, 0)->remove(); + $post->find($sel, 0)->outertext = ''; } } @@ -139,7 +140,7 @@ class VkBridge extends BridgeAbstract $content_suffix .= "
"; } $content_suffix .= "
Article: $article_title ($article_author)"; - $article->remove(); + $article->outertext = ''; } // get video on post @@ -149,7 +150,7 @@ class VkBridge extends BridgeAbstract $video_title = $video->find('div.post_video_title', 0)->plaintext; $video_link = $video->find('a.lnk', 0)->getAttribute('href'); $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); - $video->remove(); + $video->outertext = ''; $main_video_link = $video_link; } @@ -160,14 +161,14 @@ class VkBridge extends BridgeAbstract if (count($temp) > 1) $video_title = $temp[1]; $video_link = $a->getAttribute('href'); if ($video_link != $main_video_link) $this->appendVideo($video_title, $video_link, $content_suffix, $post_videos); - $a->remove(); + $a->outertext = ''; } // get all photos foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) { $result = $this->getPhoto($a); if ($result == null) continue; - $a->remove(); + $a->outertext = ''; $content_suffix .= "
$result"; } @@ -176,7 +177,7 @@ class VkBridge extends BridgeAbstract $a = $el->find('.page_album_link', 0); $album_title = $a->find('.page_album_title_text', 0)->getAttribute('title'); $album_link = $a->getAttribute('href'); - $el->remove(); + $el->outertext = ''; $content_suffix .= "
Album: $album_title"; } @@ -199,7 +200,7 @@ class VkBridge extends BridgeAbstract } - $a->remove(); + $a->outertext = ''; } // get other documents @@ -216,7 +217,7 @@ class VkBridge extends BridgeAbstract } - $div->remove(); + $div->outertext = ''; } // get polls @@ -226,14 +227,14 @@ class VkBridge extends BridgeAbstract foreach($div->find('div.page_poll_text') as $poll_stat_title) { $content_suffix .= '
- ' . $poll_stat_title->innertext; } - $div->remove(); + $div->outertext = ''; } // get sign $post_author = $pageName; foreach($post->find('a.wall_signed_by') as $a) { $post_author = $a->innertext; - $a->remove(); + $a->outertext = ''; } if (is_object($post->find('div.copy_quote', 0))) { @@ -242,7 +243,7 @@ class VkBridge extends BridgeAbstract } $copy_quote = $post->find('div.copy_quote', 0); if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) { - $copy_post_header->remove(); + $copy_post_header->outertext = ''; } $copy_quote_content = $copy_quote->innertext; $copy_quote->outertext = "
Reposted:
$copy_quote_content"; diff --git a/bridges/WikipediaBridge.php b/bridges/WikipediaBridge.php index a53652dd..7ca763fc 100644 --- a/bridges/WikipediaBridge.php +++ b/bridges/WikipediaBridge.php @@ -141,7 +141,7 @@ class WikipediaBridge extends BridgeAbstract { $anchorFallbackIndex = 0){ // Clean the bottom of the featured article if ($element->find('div', -1)) - $element->find('div', -1)->remove(); + $element->find('div', -1)->outertext = ''; // The title and URI of the article can be found in an anchor containing // the string '...' in most wikis ('full article ...') @@ -202,10 +202,10 @@ class WikipediaBridge extends BridgeAbstract { // Let's remove a couple of things from the article $table = $content->find('#toc', 0); // Table of contents if(!$table === false) - $table->remove(); + $table->outertext = ''; foreach($content->find('ol.references') as $reference) // References - $reference->remove(); + $reference->outertext = ''; return str_replace('href="/', 'href="' . $this->getURI() . 
'/', $content->innertext); } diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 18045559..1589c723 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -50,7 +50,7 @@ class WordPressBridge extends FeedExpander { foreach ($article->find('h1.entry-title') as $title) if ($title->plaintext == $item['title']) - $title->remove(); + $title->outertext = ''; $article_image = $article_html->find('img.wp-post-image', 0); if(!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) { diff --git a/bridges/WorldOfTanksBridge.php b/bridges/WorldOfTanksBridge.php index a5fa0446..46dd588d 100644 --- a/bridges/WorldOfTanksBridge.php +++ b/bridges/WorldOfTanksBridge.php @@ -44,7 +44,7 @@ class WorldOfTanksBridge extends FeedExpander { // Remove the scripts, please foreach($content->find('script') as $script) { - $script->remove(); + $script->outertext = ''; } return $content->innertext; diff --git a/bridges/XenForoBridge.php b/bridges/XenForoBridge.php index dc3a1a5e..7bf1f15d 100644 --- a/bridges/XenForoBridge.php +++ b/bridges/XenForoBridge.php @@ -193,7 +193,7 @@ class XenForoBridge extends BridgeAbstract { // Remove script tags foreach($content->find('script') as $script) { - $script->remove(); + $script->outertext = ''; } $item['content'] = $content->innertext; diff --git a/lib/html.php b/lib/html.php index 49c77f04..13db97a4 100644 --- a/lib/html.php +++ b/lib/html.php @@ -36,7 +36,7 @@ function sanitize($html, if(in_array($element->tag, $text_to_keep)) { $element->outertext = $element->plaintext; } elseif(in_array($element->tag, $tags_to_remove)) { - $element->remove(); + $element->outertext = ''; } else { foreach($element->getAllAttributes() as $attributeName => $attribute) { if(!in_array($attributeName, $attributes_to_keep)) From 94e4ef8f27ecf682d1717e3455e815fddb60a360 Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Thu, 6 Jun 2019 19:54:34 +0200 Subject: [PATCH 19/42] Add template for 
generic feature requests --- .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..8a5d06a9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: Feature-Request +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. From 62198ecfa21fe60b279ee10b1c2dab203bea69f6 Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Thu, 6 Jun 2019 19:55:57 +0200 Subject: [PATCH 20/42] Rename bridge request template Use the same naming convention for all templates --- .github/ISSUE_TEMPLATE/bridge-request.md | 64 ++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bridge-request.md diff --git a/.github/ISSUE_TEMPLATE/bridge-request.md b/.github/ISSUE_TEMPLATE/bridge-request.md new file mode 100644 index 00000000..a0080b8b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bridge-request.md @@ -0,0 +1,64 @@ +--- +name: Bridge request +about: Use this template for requesting a new bridge +title: Bridge request for ... +labels: Bridge-Request +assignees: '' + +--- + +# Bridge request + + + +## General information + + + +- _Host URI for the bridge_ (i.e. `https://github.com`): + +- Which information would you like to see? 
+ + + +- How should the information be displayed/formatted? + + + +- Which of the following parameters do you expect? + + - [X] Title + - [X] URI (link to the original article) + - [ ] Author + - [ ] Timestamp + - [X] Content (the content of the article) + - [ ] Enclosures (pictures, videos, etc...) + - [ ] Categories (categories, tags, etc...) + +## Options + + + +- [ ] Limit number of returned items + - _Default limit_: 5 +- [ ] Load full articles + - _Cache articles_ (articles are stored in a local cache on first request): yes + - _Cache timeout_ (max = 24 hours): 24 hours +- [X] Balance requests (RSS-Bridge uses cached versions to reduce bandwith usage) + - _Timeout_ (default = 5 minutes, max = 24 hours): 5 minutes + + + + From d89326fe2df71479240a966aee0551093790ee4b Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Thu, 6 Jun 2019 19:57:04 +0200 Subject: [PATCH 21/42] Remove old bridge request template --- .../ISSUE_TEMPLATE/bridge-request-template.md | 64 ------------------- 1 file changed, 64 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bridge-request-template.md diff --git a/.github/ISSUE_TEMPLATE/bridge-request-template.md b/.github/ISSUE_TEMPLATE/bridge-request-template.md deleted file mode 100644 index 344805ce..00000000 --- a/.github/ISSUE_TEMPLATE/bridge-request-template.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -name: Bridge request template -about: Use this template for requesting a new bridge -title: Bridge request for ... -labels: Bridge-Request -assignees: '' - ---- - -# Bridge request - - - -## General information - - - -- _Host URI for the bridge_ (i.e. `https://github.com`): - -- Which information would you like to see? - - - -- How should the information be displayed/formatted? - - - -- Which of the following parameters do you expect? - - - [X] Title - - [X] URI (link to the original article) - - [ ] Author - - [ ] Timestamp - - [X] Content (the content of the article) - - [ ] Enclosures (pictures, videos, etc...) 
- - [ ] Categories (categories, tags, etc...) - -## Options - - - -- [ ] Limit number of returned items - - _Default limit_: 5 -- [ ] Load full articles - - _Cache articles_ (articles are stored in a local cache on first request): yes - - _Cache timeout_ (max = 24 hours): 24 hours -- [X] Balance requests (RSS-Bridge uses cached versions to reduce bandwith usage) - - _Timeout_ (default = 5 minutes, max = 24 hours): 5 minutes - - - - From 1814116d67a912bf6b31aab4718531157615aae2 Mon Sep 17 00:00:00 2001 From: Antoine Cadoret Date: Thu, 6 Jun 2019 19:59:30 +0200 Subject: [PATCH 22/42] [SteamBridge] Follow source changes (#1143) * Follow source data fetching changes * Improve media path building * Improve price fetching and display --- bridges/SteamBridge.php | 91 +++++++++++++---------------------------- 1 file changed, 28 insertions(+), 63 deletions(-) diff --git a/bridges/SteamBridge.php b/bridges/SteamBridge.php index 8ff456d7..d0acd6da 100644 --- a/bridges/SteamBridge.php +++ b/bridges/SteamBridge.php @@ -8,44 +8,12 @@ class SteamBridge extends BridgeAbstract { const MAINTAINER = 'jacknumber'; const PARAMETERS = array( 'Wishlist' => array( - 'username' => array( - 'name' => 'Username', + 'userid' => array( + 'name' => 'Steamid64 (find it on steamid.io)', + 'title' => 'User ID (17 digits). 
Find your user ID with steamid.io or steamidfinder.com', 'required' => true, - ), - 'currency' => array( - 'name' => 'Currency', - 'type' => 'list', - 'values' => array( - // source: http://steam.steamlytics.xyz/currencies - 'USD' => 'us', - 'GBP' => 'gb', - 'EUR' => 'fr', - 'CHF' => 'ch', - 'RUB' => 'ru', - 'BRL' => 'br', - 'JPY' => 'jp', - 'SEK' => 'se', - 'IDR' => 'id', - 'MYR' => 'my', - 'PHP' => 'ph', - 'SGD' => 'sg', - 'THB' => 'th', - 'KRW' => 'kr', - 'TRY' => 'tr', - 'MXN' => 'mx', - 'CAD' => 'ca', - 'NZD' => 'nz', - 'CNY' => 'cn', - 'INR' => 'in', - 'CLP' => 'cl', - 'PEN' => 'pe', - 'COP' => 'co', - 'ZAR' => 'za', - 'HKD' => 'hk', - 'TWD' => 'tw', - 'SRD' => 'sr', - 'AED' => 'ae', - ), + 'exampleValue' => '76561198821231205', + 'pattern' => '[0-9]{17}', ), 'only_discount' => array( 'name' => 'Only discount', @@ -56,27 +24,15 @@ class SteamBridge extends BridgeAbstract { public function collectData(){ - $username = $this->getInput('username'); - $params = array( - 'cc' => $this->getInput('currency') - ); + $userid = $this->getInput('userid'); - $url = self::URI . 'wishlist/id/' . $username . '?' . http_build_query($params); - - $targetVariable = 'g_rgAppInfo'; + $sourceUrl = self::URI . 'wishlist/profiles/' . $userid . '/wishlistdata?p=0'; $sort = array(); - $html = ''; - $html = getSimpleHTMLDOM($url) - or returnServerError("Could not request Steam Wishlist. Tried:\n - $url"); + $json = getContents($sourceUrl) + or returnServerError('Could not get content from wishlistdata (' . $sourceUrl . ')'); - $jsContent = $html->find('.responsive_page_template_content script', 0)->innertext; - - if(preg_match('/var ' . $targetVariable . 
' = (.*?);/s', $jsContent, $matches)) { - $appsData = json_decode($matches[1]); - } else { - returnServerError("Could not parse JS variable ($targetVariable) in page content."); - } + $appsData = json_decode($json); foreach($appsData as $id => $element) { @@ -87,6 +43,8 @@ class SteamBridge extends BridgeAbstract { if($element->subs) { $appIsBuyable = 1; + $priceBlock = str_get_html($element->subs[0]->discount_block); + $appPrice = str_replace('--', '00', $priceBlock->find('.discount_final_price', 0)->plaintext); if($element->subs[0]->discount_pct) { @@ -94,8 +52,6 @@ class SteamBridge extends BridgeAbstract { $discountBlock = str_get_html($element->subs[0]->discount_block); $appDiscountValue = $discountBlock->find('.discount_pct', 0)->plaintext; $appOldPrice = $discountBlock->find('.discount_original_price', 0)->plaintext; - $appNewPrice = $discountBlock->find('.discount_final_price', 0)->plaintext; - $appPrice = $appNewPrice; } else { @@ -103,7 +59,6 @@ class SteamBridge extends BridgeAbstract { continue; } - $appPrice = $element->subs[0]->price / 100; } } else { @@ -117,11 +72,14 @@ class SteamBridge extends BridgeAbstract { } } + $coverUrl = str_replace('_292x136', '', strtok($element->capsule, '?')); + $picturesPath = pathinfo($coverUrl)['dirname'] . 
'/'; + $item = array(); $item['uri'] = "http://store.steampowered.com/app/$id/"; $item['title'] = $element->name; $item['type'] = $appType; - $item['cover'] = str_replace('_292x136', '', $element->capsule); + $item['cover'] = $coverUrl; $item['timestamp'] = $element->added; $item['isBuyable'] = $appIsBuyable; $item['hasDiscount'] = $appHasDiscount; @@ -129,22 +87,29 @@ class SteamBridge extends BridgeAbstract { $item['priority'] = $element->priority; if($appIsBuyable) { + $item['price'] = floatval(str_replace(',', '.', $appPrice)); + $item['content'] = $appPrice; + + } + + if($appIsFree) { + $item['content'] = 'Free'; } if($appHasDiscount) { $item['discount']['value'] = $appDiscountValue; - $item['discount']['oldPrice'] = floatval(str_replace(',', '.', $appOldPrice)); - $item['discount']['newPrice'] = floatval(str_replace(',', '.', $appNewPrice)); + $item['discount']['oldPrice'] = $appOldPrice; + $item['content'] = '' . $appOldPrice . ' ' . $appPrice . ' (' . $appDiscountValue . ')'; } $item['enclosures'] = array(); - $item['enclosures'][] = str_replace('_292x136', '', $element->capsule); + $item['enclosures'][] = $coverUrl; - foreach($element->screenshots as $screenshot) { - $item['enclosures'][] = substr($element->capsule, 0, -31) . $screenshot; + foreach($element->screenshots as $screenshotFileName) { + $item['enclosures'][] = $picturesPath . 
$screenshotFileName; } $sort[$id] = $element->priority; From b0a780acda5b4bc86327c7461a9f21628cb32b0f Mon Sep 17 00:00:00 2001 From: Eugene Molotov Date: Thu, 6 Jun 2019 23:05:41 +0500 Subject: [PATCH 23/42] [VkBridge] Ignore illegal characters in input html for iconv (#1154) --- bridges/VkBridge.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bridges/VkBridge.php b/bridges/VkBridge.php index 8653e7c9..f9aaa66a 100644 --- a/bridges/VkBridge.php +++ b/bridges/VkBridge.php @@ -52,7 +52,7 @@ class VkBridge extends BridgeAbstract $text_html = $this->getContents() or returnServerError('No results for group or user name "' . $this->getInput('u') . '".'); - $text_html = iconv('windows-1251', 'utf-8', $text_html); + $text_html = iconv('windows-1251', 'utf-8//ignore', $text_html); // makes album link generating work correctly $text_html = str_replace('"class="page_album_link">', '" class="page_album_link">', $text_html); $html = str_get_html($text_html); From d4e867f2403c6224c93d8588fadbde600aec4444 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 6 Jun 2019 20:53:44 +0200 Subject: [PATCH 24/42] core: Move default bridges to whitelist.default.txt Default bridges are currently statically defined in index.php, which is not the right place if we want to keep responsibilities separated. This commit introduces a new file whitelist.default.txt that holds the default bridges and which is loaded automatically, if whitelist.txt doesn't exist. Due to this it is also no longer necessary to have write permission for the root directory. 
References #1001 --- index.php | 19 ------------------- lib/Bridge.php | 25 ++++++++++++++----------- lib/Configuration.php | 4 ---- lib/rssbridge.php | 3 +++ whitelist.default.txt | 15 +++++++++++++++ 5 files changed, 32 insertions(+), 34 deletions(-) create mode 100644 whitelist.default.txt diff --git a/index.php b/index.php index 771e3379..0bc63625 100644 --- a/index.php +++ b/index.php @@ -29,27 +29,8 @@ define('USER_AGENT', ini_set('user_agent', USER_AGENT); -// default whitelist -$whitelist_default = array( - 'BandcampBridge', - 'CryptomeBridge', - 'DansTonChatBridge', - 'DuckDuckGoBridge', - 'FacebookBridge', - 'FlickrBridge', - 'GoogleSearchBridge', - 'IdenticaBridge', - 'InstagramBridge', - 'OpenClassroomsBridge', - 'PinterestBridge', - 'ScmbBridge', - 'TwitterBridge', - 'WikipediaBridge', - 'YoutubeBridge'); - try { - Bridge::setWhitelist($whitelist_default); $actionFac = new \ActionFactory(); $actionFac->setWorkingDir(PATH_LIB_ACTIONS); diff --git a/lib/Bridge.php b/lib/Bridge.php index 9e5750a7..dc42e79b 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -192,7 +192,8 @@ class Bridge { /** * Returns the whitelist. * - * On first call this function reads the whitelist from {@see WHITELIST}. + * On first call this function reads the whitelist from {@see WHITELIST} if + * the file exists, {@see WHITELIST_DEFAULT} otherwise. * * Each line in the file specifies one bridge on the whitelist. * * An empty file disables all bridges. * * If the file only only contains `*`, all bridges are whitelisted. 
@@ -210,19 +211,21 @@ class Bridge { if($firstCall) { - // Create initial whitelist or load from disk - if (!file_exists(WHITELIST) && !empty(self::$whitelist)) { - file_put_contents(WHITELIST, implode("\n", self::$whitelist)); - } elseif(file_exists(WHITELIST)) { - + if(file_exists(WHITELIST)) { $contents = trim(file_get_contents(WHITELIST)); + } elseif(file_exists(WHITELIST_DEFAULT)) { + $contents = trim(file_get_contents(WHITELIST_DEFAULT)); + } else { + $contents = ''; + } - if($contents === '*') { // Whitelist all bridges - self::$whitelist = self::getBridgeNames(); - } else { - self::$whitelist = array_map('self::sanitizeBridgeName', explode("\n", $contents)); + if($contents === '*') { // Whitelist all bridges + self::$whitelist = self::getBridgeNames(); + } else { + //self::$whitelist = array_map('self::sanitizeBridgeName', explode("\n", $contents)); + foreach(explode("\n", $contents) as $bridgeName) { + self::$whitelist[] = self::sanitizeBridgeName($bridgeName); } - } } diff --git a/lib/Configuration.php b/lib/Configuration.php index be9315c8..cf2fd7c8 100644 --- a/lib/Configuration.php +++ b/lib/Configuration.php @@ -106,10 +106,6 @@ final class Configuration { if(!is_writable(PATH_CACHE)) die('RSS-Bridge does not have write permissions for ' . PATH_CACHE . '!'); - // Check whitelist file permissions - if(!file_exists(WHITELIST) && !is_writable(dirname(WHITELIST))) - die('RSS-Bridge does not have write permissions for ' . WHITELIST . '!'); - } /** diff --git a/lib/rssbridge.php b/lib/rssbridge.php index 5a523588..168c91ca 100644 --- a/lib/rssbridge.php +++ b/lib/rssbridge.php @@ -38,6 +38,9 @@ define('PATH_CACHE', __DIR__ . '/../cache/'); /** Path to the whitelist file */ define('WHITELIST', __DIR__ . '/../whitelist.txt'); +/** Path to the default whitelist file */ +define('WHITELIST_DEFAULT', __DIR__ . 
'/../whitelist.default.txt'); + /** URL to the RSS-Bridge repository */ define('REPOSITORY', 'https://github.com/RSS-Bridge/rss-bridge/'); diff --git a/whitelist.default.txt b/whitelist.default.txt new file mode 100644 index 00000000..6530c324 --- /dev/null +++ b/whitelist.default.txt @@ -0,0 +1,15 @@ +Bandcamp +Cryptome +DansTonChat +DuckDuckGo +Facebook +Flickr +GoogleSearch +Identica +Instagram +OpenClassrooms +Pinterest +Scmb +Twitter +Wikipedia +Youtube From e2e0ced055c4a0ec59415ca00422b1db7fec68e9 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Thu, 6 Jun 2019 20:59:29 +0200 Subject: [PATCH 25/42] [Bridge] Improve performance for correctly written whitelist.txt If the bridge name matches exactly, it is not necessary to perform a strtolower compare of bridges. In some situations this can lead to much faster response times (depending on the number of bridges in whitelist.txt). --- lib/Bridge.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/Bridge.php b/lib/Bridge.php index dc42e79b..31607922 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -283,6 +283,12 @@ class Bridge { $name = $matches[1]; } + // Improve performance for correctly written bridge names + if(in_array($name, self::getBridgeNames())) { + $index = array_search($name, self::getBridgeNames()); + return self::getBridgeNames()[$index]; + } + // The name is valid if a corresponding bridge file is found on disk if(in_array(strtolower($name), array_map('strtolower', self::getBridgeNames()))) { $index = array_search(strtolower($name), array_map('strtolower', self::getBridgeNames())); From 946a99d3347d0e25c11bb57b563cd5daeaa0aa5b Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Fri, 7 Jun 2019 19:19:36 +0200 Subject: [PATCH 26/42] config: Add [system] => 'timezone' RSS-Bridge currently statically sets the timezone to UTC which can result in incorrect timestamps if the server is hosted in another region.
This commit adds a new configuration parameter to allow admins to specify their own timezone for their servers. Invalid values will result in an error message. Example: [system] timezone = "UTC" For compatibility reasons the default value is set to UTC. This parameter accepts any of the supported timezones listed at https://www.php.net/manual/en/timezones.php Closes #956 References #1001 --- config.default.ini.php | 8 ++++++++ lib/Configuration.php | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/config.default.ini.php b/config.default.ini.php index 5f4a75f6..b7d4fba0 100644 --- a/config.default.ini.php +++ b/config.default.ini.php @@ -4,6 +4,14 @@ ; file, it will be replaced on the next update of RSS-Bridge! You can specify ; your own configuration in 'config.ini.php' (copy this file). +[system] + +; Defines the timezone used by RSS-Bridge +; Find a list of supported timezones at +; https://www.php.net/manual/en/timezones.php +; timezone = "UTC" (default) +timezone = "UTC" + [cache] ; Defines the cache type used by RSS-Bridge diff --git a/lib/Configuration.php b/lib/Configuration.php index cf2fd7c8..c327c1d4 100644 --- a/lib/Configuration.php +++ b/lib/Configuration.php @@ -155,6 +155,12 @@ final class Configuration { } } + if(!is_string(self::getConfig('system', 'timezone')) + || !in_array(self::getConfig('system', 'timezone'), timezone_identifiers_list(DateTimeZone::ALL_WITH_BC))) + die('Parameter [system] => "timezone" is invalid! Please check "config.ini.php"!'); + + date_default_timezone_set(self::getConfig('system', 'timezone')); + if(!is_string(self::getConfig('proxy', 'url'))) die('Parameter [proxy] => "url" is not a valid string! 
Please check "config.ini.php"!'); From ccf375e9171e9af1c0907a2cce178daf94ea0aab Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Fri, 7 Jun 2019 19:45:47 +0200 Subject: [PATCH 27/42] config: Use global constant for config files The configuration files are currently hard-coded in the configuration classes and error messages. However, the implementation should not rely on specific details like the file name. Instead, the files should be part of the global definition. This commit introduces two global constants for the configuration files - FILE_CONFIG => 'config.ini.php' - FILE_CONFIG_DEFAULT => 'config.default.ini.php' --- caches/MemcachedCache.php | 10 ++++----- caches/SQLiteCache.php | 4 ++-- lib/Configuration.php | 46 +++++++++++++++++++-------------------- lib/rssbridge.php | 6 +++++ 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/caches/MemcachedCache.php b/caches/MemcachedCache.php index 42291790..f69f10b0 100644 --- a/caches/MemcachedCache.php +++ b/caches/MemcachedCache.php @@ -16,19 +16,19 @@ class MemcachedCache implements CacheInterface { $host = Configuration::getConfig(get_called_class(), 'host'); $port = Configuration::getConfig(get_called_class(), 'port'); if (empty($host) && empty($port)) { - returnServerError('Configuration for ' . get_called_class() . ' missing. Please check your config.ini.php'); + returnServerError('Configuration for ' . get_called_class() . ' missing. Please check your ' . FILE_CONFIG); } else if (empty($host)) { - returnServerError('"host" param is not set for ' . get_called_class() . '. Please check your config.ini.php'); + returnServerError('"host" param is not set for ' . get_called_class() . '. Please check your ' . FILE_CONFIG); } else if (empty($port)) { - returnServerError('"port" param is not set for ' . get_called_class() . '. Please check your config.ini.php'); + returnServerError('"port" param is not set for ' . get_called_class() . '. Please check your ' . 
FILE_CONFIG); } else if (!ctype_digit($port)) { - returnServerError('"port" param is invalid for ' . get_called_class() . '. Please check your config.ini.php'); + returnServerError('"port" param is invalid for ' . get_called_class() . '. Please check your ' . FILE_CONFIG); } $port = intval($port); if ($port < 1 || $port > 65535) { - returnServerError('"port" param is invalid for ' . get_called_class() . '. Please check your config.ini.php'); + returnServerError('"port" param is invalid for ' . get_called_class() . '. Please check your ' . FILE_CONFIG); } $conn = new Memcached(); diff --git a/caches/SQLiteCache.php b/caches/SQLiteCache.php index 7d0f584f..394e25fa 100644 --- a/caches/SQLiteCache.php +++ b/caches/SQLiteCache.php @@ -15,12 +15,12 @@ class SQLiteCache implements CacheInterface { $file = Configuration::getConfig(get_called_class(), 'file'); if (empty($file)) { - die('Configuration for ' . get_called_class() . ' missing. Please check your config.ini.php'); + die('Configuration for ' . get_called_class() . ' missing. Please check your ' . FILE_CONFIG); } if (dirname($file) == '.') { $file = PATH_CACHE . $file; } elseif (!is_dir(dirname($file))) { - die('Invalid configuration for ' . get_called_class() . '. Please check your config.ini.php'); + die('Invalid configuration for ' . get_called_class() . '. Please check your ' . FILE_CONFIG); } if (!is_file($file)) { diff --git a/lib/Configuration.php b/lib/Configuration.php index c327c1d4..bd82ce94 100644 --- a/lib/Configuration.php +++ b/lib/Configuration.php @@ -114,15 +114,13 @@ final class Configuration { * Returns an error message and aborts execution if the configuration is invalid. * * The RSS-Bridge configuration is split into two files: - * - `config.default.ini.php`: The default configuration file that ships with - * every release of RSS-Bridge (do not modify this file!). - * - `config.ini.php`: The local configuration file that can be modified by - * server administrators. 
+ * - {@see FILE_CONFIG_DEFAULT} The default configuration file that ships + * with every release of RSS-Bridge (do not modify this file!). + * - {@see FILE_CONFIG} The local configuration file that can be modified + * by server administrators. * - * The files must be located at {@see PATH_ROOT} - * - * RSS-Bridge will first load `config.default.ini.php` into memory and then - * replace parameters with the contents of `config.ini.php`. That way new + * RSS-Bridge will first load {@see FILE_CONFIG_DEFAULT} into memory and then + * replace parameters with the contents of {@see FILE_CONFIG}. That way new * parameters are automatically initialized with default values and custom * configurations can be reduced to the minimum set of parametes necessary * (only the ones that changed). @@ -136,16 +134,16 @@ final class Configuration { */ public static function loadConfiguration() { - if(!file_exists(PATH_ROOT . 'config.default.ini.php')) - die('The default configuration file "config.default.ini.php" is missing!'); + if(!file_exists(FILE_CONFIG_DEFAULT)) + die('The default configuration file "' . FILE_CONFIG_DEFAULT . '" is missing!'); - Configuration::$config = parse_ini_file(PATH_ROOT . 'config.default.ini.php', true, INI_SCANNER_TYPED); + Configuration::$config = parse_ini_file(FILE_CONFIG_DEFAULT, true, INI_SCANNER_TYPED); if(!Configuration::$config) - die('Error parsing config.default.ini.php'); + die('Error parsing ' . FILE_CONFIG_DEFAULT); - if(file_exists(PATH_ROOT . 'config.ini.php')) { + if(file_exists(FILE_CONFIG)) { // Replace default configuration with custom settings - foreach(parse_ini_file(PATH_ROOT . 
'config.ini.php', true, INI_SCANNER_TYPED) as $header => $section) { + foreach(parse_ini_file(FILE_CONFIG, true, INI_SCANNER_TYPED) as $header => $section) { foreach($section as $key => $value) { // Skip unknown sections and keys if(array_key_exists($header, Configuration::$config) && array_key_exists($key, Configuration::$config[$header])) { @@ -157,12 +155,12 @@ final class Configuration { if(!is_string(self::getConfig('system', 'timezone')) || !in_array(self::getConfig('system', 'timezone'), timezone_identifiers_list(DateTimeZone::ALL_WITH_BC))) - die('Parameter [system] => "timezone" is invalid! Please check "config.ini.php"!'); + die('Parameter [system] => "timezone" is invalid! Please check ' . FILE_CONFIG); date_default_timezone_set(self::getConfig('system', 'timezone')); if(!is_string(self::getConfig('proxy', 'url'))) - die('Parameter [proxy] => "url" is not a valid string! Please check "config.ini.php"!'); + die('Parameter [proxy] => "url" is not a valid string! Please check ' . FILE_CONFIG); if(!empty(self::getConfig('proxy', 'url'))) { /** URL of the proxy server */ @@ -170,38 +168,38 @@ final class Configuration { } if(!is_bool(self::getConfig('proxy', 'by_bridge'))) - die('Parameter [proxy] => "by_bridge" is not a valid Boolean! Please check "config.ini.php"!'); + die('Parameter [proxy] => "by_bridge" is not a valid Boolean! Please check ' . FILE_CONFIG); /** True if proxy usage can be enabled selectively for each bridge */ define('PROXY_BYBRIDGE', self::getConfig('proxy', 'by_bridge')); if(!is_string(self::getConfig('proxy', 'name'))) - die('Parameter [proxy] => "name" is not a valid string! Please check "config.ini.php"!'); + die('Parameter [proxy] => "name" is not a valid string! Please check ' . FILE_CONFIG); /** Name of the proxy server */ define('PROXY_NAME', self::getConfig('proxy', 'name')); if(!is_string(self::getConfig('cache', 'type'))) - die('Parameter [cache] => "type" is not a valid string! 
Please check "config.ini.php"!'); + die('Parameter [cache] => "type" is not a valid string! Please check ' . FILE_CONFIG); if(!is_bool(self::getConfig('cache', 'custom_timeout'))) - die('Parameter [cache] => "custom_timeout" is not a valid Boolean! Please check "config.ini.php"!'); + die('Parameter [cache] => "custom_timeout" is not a valid Boolean! Please check ' . FILE_CONFIG); /** True if the cache timeout can be specified by the user */ define('CUSTOM_CACHE_TIMEOUT', self::getConfig('cache', 'custom_timeout')); if(!is_bool(self::getConfig('authentication', 'enable'))) - die('Parameter [authentication] => "enable" is not a valid Boolean! Please check "config.ini.php"!'); + die('Parameter [authentication] => "enable" is not a valid Boolean! Please check ' . FILE_CONFIG); if(!is_string(self::getConfig('authentication', 'username'))) - die('Parameter [authentication] => "username" is not a valid string! Please check "config.ini.php"!'); + die('Parameter [authentication] => "username" is not a valid string! Please check ' . FILE_CONFIG); if(!is_string(self::getConfig('authentication', 'password'))) - die('Parameter [authentication] => "password" is not a valid string! Please check "config.ini.php"!'); + die('Parameter [authentication] => "password" is not a valid string! Please check ' . FILE_CONFIG); if(!empty(self::getConfig('admin', 'email')) && !filter_var(self::getConfig('admin', 'email'), FILTER_VALIDATE_EMAIL)) - die('Parameter [admin] => "email" is not a valid email address! Please check "config.ini.php"!'); + die('Parameter [admin] => "email" is not a valid email address! Please check ' . FILE_CONFIG); } diff --git a/lib/rssbridge.php b/lib/rssbridge.php index 168c91ca..f7c1a3d7 100644 --- a/lib/rssbridge.php +++ b/lib/rssbridge.php @@ -41,6 +41,12 @@ define('WHITELIST', __DIR__ . '/../whitelist.txt'); /** Path to the default whitelist file */ define('WHITELIST_DEFAULT', __DIR__ . 
'/../whitelist.default.txt'); +/** Path to the configuration file */ +define('FILE_CONFIG', PATH_ROOT . 'config.ini.php'); + +/** Path to the default configuration file */ +define('FILE_CONFIG_DEFAULT', PATH_ROOT . 'config.default.ini.php'); + /** URL to the RSS-Bridge repository */ define('REPOSITORY', 'https://github.com/RSS-Bridge/rss-bridge/'); From 0e30468e0f869176859404968e82c4479cffe8b4 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Fri, 7 Jun 2019 19:51:00 +0200 Subject: [PATCH 28/42] [rssbridge] Use PATH_ROOT whenever possible --- lib/rssbridge.php | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/rssbridge.php b/lib/rssbridge.php index f7c1a3d7..3b0e65d4 100644 --- a/lib/rssbridge.php +++ b/lib/rssbridge.php @@ -15,31 +15,31 @@ define('PATH_ROOT', __DIR__ . '/../'); /** Path to the core library */ -define('PATH_LIB', __DIR__ . '/../lib/'); // Path to core library +define('PATH_LIB', PATH_ROOT . 'lib/'); /** Path to the vendor library */ -define('PATH_LIB_VENDOR', __DIR__ . '/../vendor/'); +define('PATH_LIB_VENDOR', PATH_ROOT . 'vendor/'); /** Path to the bridges library */ -define('PATH_LIB_BRIDGES', __DIR__ . '/../bridges/'); +define('PATH_LIB_BRIDGES', PATH_ROOT . 'bridges/'); /** Path to the formats library */ -define('PATH_LIB_FORMATS', __DIR__ . '/../formats/'); +define('PATH_LIB_FORMATS', PATH_ROOT . 'formats/'); /** Path to the caches library */ -define('PATH_LIB_CACHES', __DIR__ . '/../caches/'); +define('PATH_LIB_CACHES', PATH_ROOT . 'caches/'); /** Path to the actions library */ -define('PATH_LIB_ACTIONS', __DIR__ . '/../actions/'); +define('PATH_LIB_ACTIONS', PATH_ROOT . 'actions/'); /** Path to the cache folder */ -define('PATH_CACHE', __DIR__ . '/../cache/'); +define('PATH_CACHE', PATH_ROOT . 'cache/'); /** Path to the whitelist file */ -define('WHITELIST', __DIR__ . '/../whitelist.txt'); +define('WHITELIST', PATH_ROOT . 
'whitelist.txt'); /** Path to the default whitelist file */ -define('WHITELIST_DEFAULT', __DIR__ . '/../whitelist.default.txt'); +define('WHITELIST_DEFAULT', PATH_ROOT . 'whitelist.default.txt'); /** Path to the configuration file */ define('FILE_CONFIG', PATH_ROOT . 'config.ini.php'); From 35bd7063915ee07e667d4d8f144ea37ddea50506 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Fri, 7 Jun 2019 20:27:17 +0200 Subject: [PATCH 29/42] [Configuration] Use common format to report errors to the user Incorrect configuration values are currently handled individually for each condition, resulting in a lot of repetitive operations. This commit adds two new private functions to report errors to the user and end execution of the script. --- lib/Configuration.php | 82 ++++++++++++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/lib/Configuration.php b/lib/Configuration.php index bd82ce94..d5d7cac1 100644 --- a/lib/Configuration.php +++ b/lib/Configuration.php @@ -80,31 +80,31 @@ final class Configuration { // Check PHP version if(version_compare(PHP_VERSION, '5.6.0') === -1) - die('RSS-Bridge requires at least PHP version 5.6.0!'); + self::reportError('RSS-Bridge requires at least PHP version 5.6.0!'); // extensions check if(!extension_loaded('openssl')) - die('"openssl" extension not loaded. Please check "php.ini"'); + self::reportError('"openssl" extension not loaded. Please check "php.ini"'); if(!extension_loaded('libxml')) - die('"libxml" extension not loaded. Please check "php.ini"'); + self::reportError('"libxml" extension not loaded. Please check "php.ini"'); if(!extension_loaded('mbstring')) - die('"mbstring" extension not loaded. Please check "php.ini"'); + self::reportError('"mbstring" extension not loaded. Please check "php.ini"'); if(!extension_loaded('simplexml')) - die('"simplexml" extension not loaded. Please check "php.ini"'); + self::reportError('"simplexml" extension not loaded. 
Please check "php.ini"'); // Allow RSS-Bridge to run without curl module in CLI mode without root certificates if(!extension_loaded('curl') && !(php_sapi_name() === 'cli' && empty(ini_get('curl.cainfo')))) - die('"curl" extension not loaded. Please check "php.ini"'); + self::reportError('"curl" extension not loaded. Please check "php.ini"'); if(!extension_loaded('json')) - die('"json" extension not loaded. Please check "php.ini"'); + self::reportError('"json" extension not loaded. Please check "php.ini"'); // Check cache folder permissions (write permissions required) if(!is_writable(PATH_CACHE)) - die('RSS-Bridge does not have write permissions for ' . PATH_CACHE . '!'); + self::reportError('RSS-Bridge does not have write permissions for ' . PATH_CACHE . '!'); } @@ -135,11 +135,11 @@ final class Configuration { public static function loadConfiguration() { if(!file_exists(FILE_CONFIG_DEFAULT)) - die('The default configuration file "' . FILE_CONFIG_DEFAULT . '" is missing!'); + self::reportError('The default configuration file is missing at ' . FILE_CONFIG_DEFAULT); Configuration::$config = parse_ini_file(FILE_CONFIG_DEFAULT, true, INI_SCANNER_TYPED); if(!Configuration::$config) - die('Error parsing ' . FILE_CONFIG_DEFAULT); + self::reportError('Error parsing ' . FILE_CONFIG_DEFAULT); if(file_exists(FILE_CONFIG)) { // Replace default configuration with custom settings @@ -155,12 +155,12 @@ final class Configuration { if(!is_string(self::getConfig('system', 'timezone')) || !in_array(self::getConfig('system', 'timezone'), timezone_identifiers_list(DateTimeZone::ALL_WITH_BC))) - die('Parameter [system] => "timezone" is invalid! Please check ' . FILE_CONFIG); + self::reportConfigurationError('system', 'timezone'); date_default_timezone_set(self::getConfig('system', 'timezone')); if(!is_string(self::getConfig('proxy', 'url'))) - die('Parameter [proxy] => "url" is not a valid string! Please check ' . 
FILE_CONFIG); + self::reportConfigurationError('proxy', 'url', 'Is not a valid string'); if(!empty(self::getConfig('proxy', 'url'))) { /** URL of the proxy server */ @@ -168,38 +168,38 @@ final class Configuration { } if(!is_bool(self::getConfig('proxy', 'by_bridge'))) - die('Parameter [proxy] => "by_bridge" is not a valid Boolean! Please check ' . FILE_CONFIG); + self::reportConfigurationError('proxy', 'by_bridge', 'Is not a valid Boolean'); /** True if proxy usage can be enabled selectively for each bridge */ define('PROXY_BYBRIDGE', self::getConfig('proxy', 'by_bridge')); if(!is_string(self::getConfig('proxy', 'name'))) - die('Parameter [proxy] => "name" is not a valid string! Please check ' . FILE_CONFIG); + self::reportConfigurationError('proxy', 'name', 'Is not a valid string'); /** Name of the proxy server */ define('PROXY_NAME', self::getConfig('proxy', 'name')); if(!is_string(self::getConfig('cache', 'type'))) - die('Parameter [cache] => "type" is not a valid string! Please check ' . FILE_CONFIG); + self::reportConfigurationError('cache', 'type', 'Is not a valid string'); if(!is_bool(self::getConfig('cache', 'custom_timeout'))) - die('Parameter [cache] => "custom_timeout" is not a valid Boolean! Please check ' . FILE_CONFIG); + self::reportConfigurationError('cache', 'custom_timeout', 'Is not a valid Boolean'); /** True if the cache timeout can be specified by the user */ define('CUSTOM_CACHE_TIMEOUT', self::getConfig('cache', 'custom_timeout')); if(!is_bool(self::getConfig('authentication', 'enable'))) - die('Parameter [authentication] => "enable" is not a valid Boolean! Please check ' . FILE_CONFIG); + self::reportConfigurationError('authentication', 'enable', 'Is not a valid Boolean'); if(!is_string(self::getConfig('authentication', 'username'))) - die('Parameter [authentication] => "username" is not a valid string! Please check ' . 
FILE_CONFIG); + self::reportConfigurationError('authentication', 'username', 'Is not a valid string'); if(!is_string(self::getConfig('authentication', 'password'))) - die('Parameter [authentication] => "password" is not a valid string! Please check ' . FILE_CONFIG); + self::reportConfigurationError('authentication', 'password', 'Is not a valid string'); if(!empty(self::getConfig('admin', 'email')) && !filter_var(self::getConfig('admin', 'email'), FILTER_VALIDATE_EMAIL)) - die('Parameter [admin] => "email" is not a valid email address! Please check ' . FILE_CONFIG); + self::reportConfigurationError('admin', 'email', 'Is not a valid email address'); } @@ -246,4 +246,46 @@ final class Configuration { return Configuration::$VERSION; } + + /** + * Reports a configuration error for the specified section and key to the + * user and ends execution + * + * @param string $section The section name + * @param string $key The configuration key + * @param string $message An optional message to the user + * + * @return void + */ + private static function reportConfigurationError($section, $key, $message = '') { + + $report = "Parameter [{$section}] => \"{$key}\" is invalid!" . PHP_EOL; + + if(file_exists(FILE_CONFIG)) { + $report .= 'Please check your configuration file at ' . FILE_CONFIG . PHP_EOL; + } elseif(!file_exists(FILE_CONFIG_DEFAULT)) { + $report .= 'The default configuration file is missing at ' . FILE_CONFIG_DEFAULT . PHP_EOL; + } else { + $report .= 'The default configuration file is broken.' . PHP_EOL + . 'Restore the original file from ' . REPOSITORY . PHP_EOL; + } + + $report .= $message; + self::reportError($report); + + } + + /** + * Reports an error message to the user and ends execution + * + * @param string $message The error message + * + * @return void + */ + private static function reportError($message) { + + header('Content-Type: text/plain', true, 500); + die('Configuration error' . PHP_EOL .
$message); + + } } From 3d231a417f3670d3e93fb63e9af9d62a5d17e87d Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Fri, 7 Jun 2019 20:38:07 +0200 Subject: [PATCH 30/42] bridges: Don't kill scripts with die() Bridges should generally utilize the API functions instead of killing the script. Find more information on the Wiki. - returnServerError https://github.com/RSS-Bridge/rss-bridge/wiki/The-returnServerError-function - returnClientError https://github.com/RSS-Bridge/rss-bridge/wiki/The-returnClientError-function - returnError https://github.com/RSS-Bridge/rss-bridge/wiki/The-returnError-function --- bridges/Arte7Bridge.php | 3 ++- bridges/FB2Bridge.php | 14 +++++++------- bridges/GOGBridge.php | 8 ++++---- bridges/Rue89Bridge.php | 5 +++-- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/bridges/Arte7Bridge.php b/bridges/Arte7Bridge.php index ff722113..562f648f 100644 --- a/bridges/Arte7Bridge.php +++ b/bridges/Arte7Bridge.php @@ -91,7 +91,8 @@ class Arte7Bridge extends BridgeAbstract { 'Authorization: Bearer ' . 
self::API_TOKEN ); - $input = getContents($url, $header) or die('Could not request ARTE.'); + $input = getContents($url, $header) + or returnServerError('Could not request ARTE.'); $input_json = json_decode($input, true); foreach($input_json['videos'] as $element) { diff --git a/bridges/FB2Bridge.php b/bridges/FB2Bridge.php index 29df7554..2faa3215 100644 --- a/bridges/FB2Bridge.php +++ b/bridges/FB2Bridge.php @@ -72,15 +72,15 @@ class FB2Bridge extends BridgeAbstract { $pageInfo = $this->getPageInfos($page, $cookies); if($pageInfo['userId'] === null) { - echo <<find('article') as $content) { $item = array(); - //echo $content; die(); + preg_match('/publish_time\\\":([0-9]+),/', $content->getAttribute('data-store', 0), $match); if(isset($match[1])) $timestamp = $match[1]; diff --git a/bridges/GOGBridge.php b/bridges/GOGBridge.php index 669332f0..09f47b4b 100644 --- a/bridges/GOGBridge.php +++ b/bridges/GOGBridge.php @@ -8,8 +8,8 @@ class GOGBridge extends BridgeAbstract { public function collectData() { - $values = getContents('https://www.gog.com/games/ajax/filtered?limit=25&sort=new') or - die('Unable to get the news pages from GOG !'); + $values = getContents('https://www.gog.com/games/ajax/filtered?limit=25&sort=new') + or returnServerError('Unable to get the news pages from GOG !'); $decodedValues = json_decode($values); $limit = 0; @@ -38,8 +38,8 @@ class GOGBridge extends BridgeAbstract { private function buildGameContentPage($game) { - $gameDescriptionText = getContents('https://api.gog.com/products/' . $game->id . '?expand=description') or - die('Unable to get game description from GOG !'); + $gameDescriptionText = getContents('https://api.gog.com/products/' . $game->id . 
'?expand=description') + or returnServerError('Unable to get game description from GOG !'); $gameDescriptionValue = json_decode($gameDescriptionText); diff --git a/bridges/Rue89Bridge.php b/bridges/Rue89Bridge.php index 934ef991..bbb14662 100644 --- a/bridges/Rue89Bridge.php +++ b/bridges/Rue89Bridge.php @@ -9,7 +9,7 @@ class Rue89Bridge extends BridgeAbstract { public function collectData() { $jsonArticles = getContents('https://appdata.nouvelobs.com/rue89/feed.json') - or die('Unable to query Rue89 !'); + or returnServerError('Unable to query Rue89 !'); $articles = json_decode($jsonArticles)->items; foreach($articles as $article) { $this->items[] = $this->getArticle($article); @@ -19,7 +19,8 @@ class Rue89Bridge extends BridgeAbstract { private function getArticle($articleInfo) { - $articleJson = getContents($articleInfo->json_url) or die('Unable to get article !'); + $articleJson = getContents($articleInfo->json_url) + or returnServerError('Unable to get article !'); $article = json_decode($articleJson); $item = array(); $item['title'] = $article->title; From 69a04987327b773606a10889ba6dcc58b58b7a53 Mon Sep 17 00:00:00 2001 From: Squirrel Date: Sat, 8 Jun 2019 20:53:26 +0800 Subject: [PATCH 31/42] [README] Add deploy button to Heroku (#1150) * Add deploy button to Heroku * Add composer.json and composer.lock (required by Heroku) --- README.md | 1 + app.json | 8 ++++++++ composer.json | 12 ++++++++++++ composer.lock | 26 ++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 app.json create mode 100644 composer.json create mode 100644 composer.lock diff --git a/README.md b/README.md index 530f90fe..28873c4a 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ Thanks to the community, hosting your own instance of RSS-Bridge is as easy as c [![Deploy on Scalingo](https://cdn.scalingo.com/deploy/button.svg)](https://my.scalingo.com/deploy?source=https://github.com/sebsauvage/rss-bridge) [![Deploy to Docker 
Cloud](https://files.cloud.docker.com/images/deploy-to-dockercloud.svg)](https://cloud.docker.com/stack/deploy/?repo=https://github.com/rss-bridge/rss-bridge) +[![Deploy to Heroku](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy) Getting involved === diff --git a/app.json b/app.json new file mode 100644 index 00000000..f1847995 --- /dev/null +++ b/app.json @@ -0,0 +1,8 @@ +{ + "service": "Heroku", + "name": "RSS-Bridge", + "description": "RSS-Bridge is a PHP project capable of generating RSS and Atom feeds for websites which don't have one.", + "repository": "https://github.com/RSS-Bridge/rss-bridge", + "keywords": ["php", "rss-bridge", "rss"] +} + diff --git a/composer.json b/composer.json new file mode 100644 index 00000000..8748cb39 --- /dev/null +++ b/composer.json @@ -0,0 +1,12 @@ +{ + "require": { + "php": ">=5.6", + "ext-mbstring": "*", + "ext-sqlite3": "*", + "ext-curl": "*", + "ext-openssl": "*", + "ext-libxml": "*", + "ext-simplexml": "*", + "ext-json": "*" + } +} diff --git a/composer.lock b/composer.lock new file mode 100644 index 00000000..3d8d9f2f --- /dev/null +++ b/composer.lock @@ -0,0 +1,26 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "This file is @generated automatically" + ], + "content-hash": "ef341ee18f28c7bd5832e188fe157734", + "packages": [], + "packages-dev": [], + "aliases": [], + "minimum-stability": "stable", + "stability-flags": [], + "prefer-stable": false, + "prefer-lowest": false, + "platform": { + "php": ">=5.6", + "ext-mbstring": "*", + "ext-sqlite3": "*", + "ext-curl": "*", + "ext-openssl": "*", + "ext-libxml": "*", + "ext-simplexml": "*", + "ext-json": "*" + }, + "platform-dev": [] +} From ca1a5feba5ac5d4a173983316fdc23c4bb396073 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 14:58:18 +0200 Subject: [PATCH 32/42] [.gitattributes] Annotate 
export-ignore sections --- .gitattributes | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index a141bb12..84736e47 100644 --- a/.gitattributes +++ b/.gitattributes @@ -22,13 +22,19 @@ *.RTF diff=astextplain # Ignore files in git archive (i.e. GitHub release builds) +## Docker Dockerfile export-ignore +.dockerignore export-ignore +## Travis .travis.yml export-ignore +## GitHub .github/ export-ignore +## Git .gitattributes export-ignore .gitignore export-ignore -.dockerignore export-ignore +## Scalingo scalingo.json export-ignore +## RSS-Bridge phpunit.xml export-ignore phpcs.xml export-ignore phpcompatibility.xml export-ignore From b74dda7af915b35dd13d6c7e87bc9df1a6d72027 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 15:00:01 +0200 Subject: [PATCH 33/42] [.gitattributes] Exclude Composer and Heroku files from release builds --- .gitattributes | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 84736e47..b789ff07 100644 --- a/.gitattributes +++ b/.gitattributes @@ -39,4 +39,9 @@ phpunit.xml export-ignore phpcs.xml export-ignore phpcompatibility.xml export-ignore tests/ export-ignore -cache/.gitkeep export-ignore \ No newline at end of file +cache/.gitkeep export-ignore +## Composer +composer.json export-ignore +composer.lock export-ignore +## Heroku +app.json export-ignore From 95388cdf4434cda2c6a23e11a72f61000c180727 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 15:03:25 +0200 Subject: [PATCH 34/42] [.gitattributes] Exclude demo bridges from release builds --- .gitattributes | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/.gitattributes b/.gitattributes index b789ff07..6bd62b44 100644 --- a/.gitattributes +++ b/.gitattributes @@ -23,25 +23,27 @@ # Ignore files in git archive (i.e. 
GitHub release builds) ## Docker -Dockerfile export-ignore -.dockerignore export-ignore +Dockerfile export-ignore +.dockerignore export-ignore ## Travis -.travis.yml export-ignore +.travis.yml export-ignore ## GitHub -.github/ export-ignore +.github/ export-ignore ## Git -.gitattributes export-ignore -.gitignore export-ignore +.gitattributes export-ignore +.gitignore export-ignore ## Scalingo -scalingo.json export-ignore +scalingo.json export-ignore ## RSS-Bridge -phpunit.xml export-ignore -phpcs.xml export-ignore -phpcompatibility.xml export-ignore -tests/ export-ignore -cache/.gitkeep export-ignore +phpunit.xml export-ignore +phpcs.xml export-ignore +phpcompatibility.xml export-ignore +tests/ export-ignore +cache/.gitkeep export-ignore +bridges/DemoBridge.php export-ignore +bridges/FeedExpanderExampleBridge.php export-ignore ## Composer -composer.json export-ignore -composer.lock export-ignore +composer.json export-ignore +composer.lock export-ignore ## Heroku -app.json export-ignore +app.json export-ignore From 69dd33ac826a06ec04b64dc35655511e050bee37 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 15:07:08 +0200 Subject: [PATCH 35/42] [.gitattributes] Use the same indentation style for the entire file --- .gitattributes | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.gitattributes b/.gitattributes index 6bd62b44..13ebe2ca 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,16 +10,16 @@ *.dbproj merge=union # Standard to msysgit -*.doc diff=astextplain -*.DOC diff=astextplain -*.docx diff=astextplain -*.DOCX diff=astextplain -*.dot diff=astextplain -*.DOT diff=astextplain -*.pdf diff=astextplain -*.PDF diff=astextplain -*.rtf diff=astextplain -*.RTF diff=astextplain +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF 
diff=astextplain # Ignore files in git archive (i.e. GitHub release builds) ## Docker From 84450371b5491757bcf6c3672b6b165aa6d08285 Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Sat, 8 Jun 2019 15:19:56 +0200 Subject: [PATCH 36/42] [README] Remove Deploy to Docker Cloud button In December 2018 Docker Cloud has become part of Docker Hub: https://blog.docker.com/2018/12/the-new-docker-hub/ Since then the "Deploy to Docker Cloud" button is broken (error 404) with no alternative for Docker Hub, so the button should be removed. Docker images are still available at https://hub.docker.com/r/rssbridge/rss-bridge/ --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 28873c4a..2714f206 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,6 @@ Deploy Thanks to the community, hosting your own instance of RSS-Bridge is as easy as clicking a button! [![Deploy on Scalingo](https://cdn.scalingo.com/deploy/button.svg)](https://my.scalingo.com/deploy?source=https://github.com/sebsauvage/rss-bridge) -[![Deploy to Docker Cloud](https://files.cloud.docker.com/images/deploy-to-dockercloud.svg)](https://cloud.docker.com/stack/deploy/?repo=https://github.com/rss-bridge/rss-bridge) [![Deploy to Heroku](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy) Getting involved From f28cbecc025fdef3e3e724b2dbb576886d5209fb Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 15:50:13 +0200 Subject: [PATCH 37/42] [style] Fix placeholder should be hidden on focus The placeholder is currently visible on key focus and only hidden once a user starts typing. This can be confusing and doesn't look good. As it turns out, ::placeholder is an official selector: https://developer.mozilla.org/en-US/docs/Web/CSS/::placeholder For some reason, listing placeholder selectors with "," doesn't work on some browsers (tested in FF 60 ESR). Making each of the selectors explicit works, however. 
--- static/style.css | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/static/style.css b/static/style.css index fa70f2d9..4abf9f96 100644 --- a/static/style.css +++ b/static/style.css @@ -84,6 +84,12 @@ input[type="number"]:focus { border-color: #888; } +input:focus::-webkit-input-placeholder { opacity: 0; } +input:focus::-moz-placeholder { opacity: 0; } +input:focus::placeholder { opacity: 0; } +input:focus:-moz-placeholder { opacity: 0; } +input:focus:-ms-input-placeholder { opacity: 0; } + .searchbar { width: 40%; margin: 40px auto 100px; @@ -101,13 +107,6 @@ input[type="number"]:focus { text-align: center; } -.searchbar input[type="text"]:focus::-webkit-input-placeholder, -.searchbar input[type="text"]:focus::-moz-placeholder, -.searchbar input[type="text"]:focus:-moz-placeholder, -.searchbar input[type="text"]:focus:-ms-input-placeholder { - opacity: 0; -} - .searchbar > h3 { font-size: 200%; font-weight: bold; From 17f587fcbe314bf96ad4e470058bf67401fa8fa0 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 16:16:01 +0200 Subject: [PATCH 38/42] [index] Don't set the timezone in index.php --- index.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/index.php b/index.php index 0bc63625..666b9e45 100644 --- a/index.php +++ b/index.php @@ -6,8 +6,6 @@ Configuration::loadConfiguration(); Authentication::showPromptIfNeeded(); -date_default_timezone_set('UTC'); - /* Move the CLI arguments to the $_GET array, in order to be able to use rss-bridge from the command line From 5a9519967b0bb6b16e43e495c63d13c6fb41c021 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 17:01:38 +0200 Subject: [PATCH 39/42] [Exceptions] Add button to search for similar issues on GitHub Users currently only get one option: to open a new issue on GitHub. This can, however, result in duplicate issues, which is not desired. 
This commit adds a second button to the error message, which links to the GitHub issues tracker with the search query set to find errors for the current bridge. That way, users can collaborate on the same issue. --- lib/Exceptions.php | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/lib/Exceptions.php b/lib/Exceptions.php index 112580de..c749780c 100644 --- a/lib/Exceptions.php +++ b/lib/Exceptions.php @@ -11,6 +11,15 @@ * @link https://github.com/rss-bridge/rss-bridge */ +/** + * Builds a GitHub search query to find open bugs for the current bridge + */ +function buildGitHubSearchQuery($bridgeName){ + return REPOSITORY + . 'issues?q=' + . urlencode('is:issue is:open ' . $bridgeName); +} + /** * Returns an URL that automatically populates a new issue on GitHub based * on the information provided @@ -84,6 +93,7 @@ function buildBridgeException($e, $bridge){ $body_html = nl2br($body); $link = buildGitHubIssueQuery($title, $body, 'Bridge-Broken', $bridge->getMaintainer()); + $searchQuery = buildGitHubSearchQuery($bridge::NAME); $header = buildHeader($e, $bridge); $message = << {$body_html} EOD; - $section = buildSection($e, $bridge, $message, $link); + $section = buildSection($e, $bridge, $message, $link, $searchQuery); return $section; } @@ -120,10 +130,11 @@ function buildTransformException($e, $bridge){ . 
'`'; $link = buildGitHubIssueQuery($title, $body, 'Bridge-Broken', $bridge->getMaintainer()); + $searchQuery = buildGitHubSearchQuery($bridge::NAME); $header = buildHeader($e, $bridge); $message = "RSS-Bridge was unable to transform the contents returned by {$bridge->getName()}!"; - $section = buildSection($e, $bridge, $message, $link); + $section = buildSection($e, $bridge, $message, $link, $searchQuery); return buildPage($title, $header, $section); } @@ -154,11 +165,12 @@ EOD; * @param object $bridge The bridge object * @param string $message The message to display * @param string $link The link to include in the anchor + * @param string $searchQuery A GitHub search query for the current bridge * @return string The HTML section * * @todo This function belongs inside a class */ -function buildSection($e, $bridge, $message, $link){ +function buildSection($e, $bridge, $message, $link, $searchQuery){ return <<

{$message}

@@ -166,9 +178,13 @@ function buildSection($e, $bridge, $message, $link){
  • Press Return to check your input parameters
  • Press F5 to retry
  • +
  • Check if this issue was already reported on GitHub (give it a thumbs-up)
  • Open a GitHub Issue if this error persists
+ + +

{$bridge->getMaintainer()}

From f00a054e0fcac783ae54b2aedaff8e054f8787dc Mon Sep 17 00:00:00 2001 From: Joseph Date: Sat, 8 Jun 2019 17:30:42 +0000 Subject: [PATCH 40/42] [BrutBridge] Add new bridge (#1159) --- bridges/BrutBridge.php | 142 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 bridges/BrutBridge.php diff --git a/bridges/BrutBridge.php b/bridges/BrutBridge.php new file mode 100644 index 00000000..432cb502 --- /dev/null +++ b/bridges/BrutBridge.php @@ -0,0 +1,142 @@ + array( + 'name' => 'Category', + 'type' => 'list', + 'values' => array( + 'News' => 'news', + 'International' => 'international', + 'Economy' => 'economy', + 'Science and Technology' => 'science-and-technology', + 'Entertainment' => 'entertainment', + 'Sports' => 'sport', + 'Nature' => 'nature', + ), + 'defaultValue' => 'news', + ), + 'edition' => array( + 'name' => ' Edition', + 'type' => 'list', + 'values' => array( + 'United States' => 'us', + 'United Kingdom' => 'uk', + 'France' => 'fr', + 'India' => 'in', + 'Mexico' => 'mx', + ), + 'defaultValue' => 'us', + ) + ) + ); + + const CACHE_TIMEOUT = 1800; // 30 mins + + private $videoId = ''; + private $videoType = ''; + private $videoImage = ''; + + public function collectData() { + + $html = getSimpleHTMLDOM($this->getURI()) + or returnServerError('Could not request: ' . $this->getURI()); + + $results = $html->find('div.results', 0); + + foreach($results->find('li.col-6.col-sm-4.col-md-3.col-lg-2.px-2.pb-4') as $index => $li) { + $item = array(); + + $videoPath = self::URI . $li->children(0)->href; + + $videoPageHtml = getSimpleHTMLDOMCached($videoPath, 3600) + or returnServerError('Could not request: ' . 
$videoPath); + + $this->videoImage = $videoPageHtml->find('meta[name="twitter:image"]', 0)->content; + + $this->processTwitterImage(); + + $description = $videoPageHtml->find('div.description', 0); + + $item['uri'] = $videoPath; + $item['title'] = $description->find('h1', 0)->plaintext; + + if ($description->find('div.date', 0)->children(0)) { + $description->find('div.date', 0)->children(0)->outertext = ''; + } + + $item['content'] = $this->processContent( + $description + ); + + $item['timestamp'] = $this->processDate($description); + $item['enclosures'][] = $this->videoImage; + + $this->items[] = $item; + + if (count($this->items) >= 5) { + break; + } + } + } + + public function getURI() { + + if (!is_null($this->getInput('edition')) && !is_null($this->getInput('category'))) { + return self::URI . '/' . $this->getInput('edition') . '/' . $this->getInput('category'); + } + + return parent::getURI(); + } + + private function processDate($description) { + + if ($this->getInput('edition') === 'uk') { + $date = DateTime::createFromFormat('d/m/Y H:i', $description->find('div.date', 0)->innertext); + return strtotime($date->format('Y-m-d H:i:s')); + } + + return strtotime($description->find('div.date', 0)->innertext); + } + + private function processContent($description) { + + $content = ''; + $content .= '

' . $description->find('h2.mb-1', 0)->innertext . '

'; + + if ($description->find('div.text.pb-3', 0)->children(1)->class != 'date') { + $content .= '

' . $description->find('div.text.pb-3', 0)->children(1)->innertext . '

'; + } + + return $content; + } + + private function processTwitterImage() { + /** + * Extract video ID + type from twitter image + * + * Example (wrapped): + * https://img.brut.media/thumbnail/ + * the-life-of-rita-moreno-2cce75b5-d448-44d2-a97c-ca50d6470dd4-square.jpg + * ?ts=1559337892 + */ + $fpath = parse_url($this->videoImage, PHP_URL_PATH); + $fname = basename($fpath); + $fname = substr($fname, 0, strrpos($fname, '.')); + $parts = explode('-', $fname); + + if (end($parts) === 'auto') { + $key = array_search('auto', $parts); + unset($parts[$key]); + } + + $this->videoId = implode('-', array_splice($parts, -6, 5)); + $this->videoType = end($parts); + } +} From 5ff3d0121c47a83dc1cc55c18ae6023129c10eee Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 20:04:06 +0200 Subject: [PATCH 41/42] [README] Update list of contributors --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2714f206..95086c03 100644 --- a/README.md +++ b/README.md @@ -116,13 +116,14 @@ https://gist.github.com/LogMANOriginal/da00cd1e5f0ca31cef8e193509b17fd8 * [Ahiles3005](https://github.com/Ahiles3005) * [Albirew](https://github.com/Albirew) * [aledeg](https://github.com/aledeg) + * [alex73](https://github.com/alex73) * [alexAubin](https://github.com/alexAubin) * [AmauryCarrade](https://github.com/AmauryCarrade) - * [AntoineTurmel](https://github.com/AntoineTurmel) * [ArthurHoaro](https://github.com/ArthurHoaro) * [Astalaseven](https://github.com/Astalaseven) * [Astyan-42](https://github.com/Astyan-42) * [az5he6ch](https://github.com/az5he6ch) + * [azdkj532](https://github.com/azdkj532) * [b1nj](https://github.com/b1nj) * [benasse](https://github.com/benasse) * [captn3m0](https://github.com/captn3m0) @@ -156,6 +157,7 @@ https://gist.github.com/LogMANOriginal/da00cd1e5f0ca31cef8e193509b17fd8 * [jdigilio](https://github.com/jdigilio) * [JeremyRand](https://github.com/JeremyRand) * 
[Jocker666z](https://github.com/Jocker666z) + * [killruana](https://github.com/killruana) * [klimplant](https://github.com/klimplant) * [kranack](https://github.com/kranack) * [kraoc](https://github.com/kraoc) @@ -173,7 +175,6 @@ https://gist.github.com/LogMANOriginal/da00cd1e5f0ca31cef8e193509b17fd8 * [mdemoss](https://github.com/mdemoss) * [melangue](https://github.com/melangue) * [metaMMA](https://github.com/metaMMA) - * [mickael-bertrand](https://github.com/mickael-bertrand) * [mitsukarenai](https://github.com/mitsukarenai) * [MonsieurPoutounours](https://github.com/MonsieurPoutounours) * [mr-flibble](https://github.com/mr-flibble) @@ -208,8 +209,10 @@ https://gist.github.com/LogMANOriginal/da00cd1e5f0ca31cef8e193509b17fd8 * [thefranke](https://github.com/thefranke) * [TheRadialActive](https://github.com/TheRadialActive) * [triatic](https://github.com/triatic) + * [VerifiedJoseph](https://github.com/VerifiedJoseph) * [WalterBarrett](https://github.com/WalterBarrett) * [wtuuju](https://github.com/wtuuju) + * [xurxof](https://github.com/xurxof) * [yardenac](https://github.com/yardenac) * [ZeNairolf](https://github.com/ZeNairolf) From c17b8642425dc946b6e787958fed630bf93b2029 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 8 Jun 2019 20:04:57 +0200 Subject: [PATCH 42/42] [Configuration] Bump version to 2019-06-08 --- lib/Configuration.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Configuration.php b/lib/Configuration.php index d5d7cac1..d6a31dfc 100644 --- a/lib/Configuration.php +++ b/lib/Configuration.php @@ -28,7 +28,7 @@ final class Configuration { * * @todo Replace this property by a constant. */ - public static $VERSION = 'dev.2019-05-08'; + public static $VERSION = '2019-06-08'; /** * Holds the configuration data.