From 005b22701d17d72b6273242c0b735597311db4b4 Mon Sep 17 00:00:00 2001
From: Simon816 <1497508+simon816@users.noreply.github.com>
Date: Fri, 29 Jan 2021 18:27:35 +0000
Subject: [PATCH] [FSecureBlogBridge] Add bridge (#1932)

---
 bridges/FSecureBlogBridge.php | 115 ++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 bridges/FSecureBlogBridge.php

diff --git a/bridges/FSecureBlogBridge.php b/bridges/FSecureBlogBridge.php
new file mode 100644
index 00000000..47910960
--- /dev/null
+++ b/bridges/FSecureBlogBridge.php
@@ -0,0 +1,115 @@
+ array(
+            'categories' => array(
+                'name' => 'Blog categories',
+                'exampleValue' => 'home-security',
+            ),
+            'language' => array(
+                'name' => 'Language',
+                'defaultValue' => 'en',
+            ),
+            'oldest_date' => array(
+                'name' => 'Oldest article date',
+                'exampleValue' => '-2 months',
+            ),
+        )
+    );
+
+    /** @var array Article URLs already added to the feed during this run (URL => 1). */
+    private $seen = array();
+
+    /** @var int Unix timestamp cutoff; older articles stop the crawl (0 = no cutoff). */
+    private $oldest = 0;
+
+    /**
+     * Base URI of the blog for the configured language.
+     *
+     * English is served from the site root; any other language is a
+     * sub-path of it.
+     *
+     * @return string
+     */
+    public function getURI() {
+        // `?:` rather than `or`: `or` binds looser than `=`, so with
+        // `$lang = ... or 'en'` the fallback was never assigned.
+        $lang = $this->getInput('language') ?: 'en';
+        if ($lang === 'en') {
+            return self::URI;
+        }
+        return self::URI . "/$lang";
+    }
+
+    /**
+     * Collects articles from the requested categories, or from every
+     * category linked in the site's header menu when none are requested.
+     */
+    public function collectData() {
+        $this->items = array();
+        $this->seen = array();
+
+        // An empty or unparsable date disables the cutoff.
+        $oldestInput = $this->getInput('oldest_date');
+        $this->oldest = empty($oldestInput) ? 0 : (strtotime($oldestInput) ?: 0);
+
+        $categories = $this->getInput('categories');
+        if (!empty($categories)) {
+            foreach (explode(',', $categories) as $cat) {
+                // Accept "a, b" style input; stray whitespace would otherwise
+                // end up inside the category URL.
+                $cat = trim($cat);
+                if ($cat !== '') {
+                    $this->collectCategory($cat);
+                }
+            }
+            return;
+        }
+
+        $html = getSimpleHTMLDOMCached($this->getURI() . '/');
+
+        $marker = '/category/';
+        foreach ($html->find('ul.c-header-menu-desktop__list li a') as $link) {
+            // Links without a path component (or malformed ones) cannot name
+            // a category.
+            $path = parse_url($link->href, PHP_URL_PATH);
+            if (!is_string($path)) {
+                continue;
+            }
+            $pos = strpos($path, $marker);
+            if ($pos === false) {
+                continue;
+            }
+            // Strip surrounding slashes instead of blindly dropping the last
+            // character, which truncated the slug when the link had no
+            // trailing slash.
+            $cat = trim(substr($path, $pos + strlen($marker)), '/');
+            if ($cat !== '') {
+                $this->collectCategory($cat);
+            }
+        }
+    }
+
+    /**
+     * Walks a category's paginated listing until collectListing() reports
+     * that there is no further page to follow.
+     *
+     * @param string $category Category slug as it appears in the URL.
+     */
+    private function collectCategory($category) {
+        $url = $this->getURI() . "/category/$category/";
+        while ($url) {
+            $url = $this->collectListing($url);
+        }
+    }
+
+    // n.b. 
+    /**
+     * Collects every article linked from one listing page.
+     *
+     * n.b. this relies on articles being ordered by date (newest first) so
+     * the cutoff can stop the crawl at the first article that is too old.
+     *
+     * @param string $url Listing page URL.
+     * @return string|null URL of the next listing page, or null when there
+     *                     is no further page or the cutoff was reached.
+     */
+    private function collectListing($url) {
+        $html = getSimpleHTMLDOMCached($url, 60 * 60);
+        $items = $html->find('section.b-blog .l-blog__content__listing div.c-listing-item');
+
+        foreach ($items as $item) {
+            $articleUrl = $item->getAttribute('data-url');
+            if (empty($articleUrl)) {
+                continue; // Listing entry without a target, nothing to fetch
+            }
+            if (!$this->collectArticle($articleUrl)) {
+                return null; // Too old, stop collecting
+            }
+        }
+
+        // link[rel=next] points to a 404 on the non-English blogs, so the
+        // visible pagination control is followed instead.
+        $next = $html->find('ul.page-numbers a.next', 0);
+        return $next ? $next->href : null;
+    }
+
+    /**
+     * Fetches one article and appends it to the feed items.
+     *
+     * @param string $url Article URL.
+     * @return bool Whether to continue collecting, i.e. false once an
+     *              article older than the cutoff is found.
+     */
+    private function collectArticle($url) {
+        if (isset($this->seen[$url])) {
+            return true; // Already collected earlier in this run
+        }
+        $html = getSimpleHTMLDOMCached($url);
+
+        $dateMeta = $html->find('meta[property=article:published_time]', 0);
+        $dt = $dateMeta ? $dateMeta->content : null;
+        $published = $dt ? strtotime($dt) : false;
+        if ($published === false) {
+            // No usable date: with a cutoff configured, stop rather than
+            // keep crawling; without one, keep the article (undated).
+            if ($this->oldest !== 0) {
+                return false;
+            }
+        } elseif ($published < $this->oldest) {
+            return false; // Exit if too old
+        }
+
+        // Each lookup may come back empty if the page layout differs, so
+        // every element is checked before its properties are read.
+        $titleMeta = $html->find('meta[property=og:title]', 0);
+        $imageMeta = $html->find('meta[property=og:image]', 0);
+        $authorLink = $html->find('.c-blog-author__text a', 0);
+        $article = $html->find('article', 0);
+
+        $rssItem = array(
+            'uri' => $url,
+            'uid' => $url,
+            'title' => $titleMeta ? $titleMeta->content : '',
+            'enclosures' => $imageMeta ? array($imageMeta->content) : array(),
+            'author' => $authorLink ? trim($authorLink->plaintext) : '',
+            'categories' => array_map(function ($link) {
+                return trim($link->plaintext);
+            }, $html->find('.b-single-header__categories .c-category-list a')),
+            'content' => $article ? trim($article->innertext) : '',
+        );
+        if ($published !== false) {
+            $rssItem['timestamp'] = $dt;
+        }
+
+        $this->items[] = $rssItem;
+        $this->seen[$url] = 1;
+        return true;
+    }
+}