From ee158468fa2e258ad9f947b03ab13d5f0e961c6f Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Mon, 15 Oct 2018 18:09:20 +0200 Subject: [PATCH] Expanded Sexactu to cover the whole GQ magazine (#861) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bridge has been expanded to better cover the whole GQ magazine. It should support all countries (provided they all use the same absurdly shitty publication system). It is guaranteed to have been tested only with sexactu articles (which I now obtain by loading Maïa Mazaurette's author page). --- bridges/GQMagazineBridge.php | 119 +++++++++++++++++++++++++++++++++++ bridges/SexactuBridge.php | 88 -------------------------- 2 files changed, 119 insertions(+), 88 deletions(-) create mode 100644 bridges/GQMagazineBridge.php delete mode 100644 bridges/SexactuBridge.php diff --git a/bridges/GQMagazineBridge.php b/bridges/GQMagazineBridge.php new file mode 100644 index 00000000..fa36c4e7 --- /dev/null +++ b/bridges/GQMagazineBridge.php @@ -0,0 +1,119 @@ + array( + 'name' => 'Domain to use', + 'required' => true, + 'values' => array( + 'www.gqmagazine.fr' => 'www.gqmagazine.fr' + ), + 'defaultValue' => 'www.gqmagazine.fr' + ), + 'page' => array( + 'name' => 'Initial page to load', + 'required' => true + ), + )); + + const REPLACED_ATTRIBUTES = array( + 'href' => 'href', + 'src' => 'src', + 'data-original' => 'src' + ); + + private function getDomain() { + return $this->getInput('domain'); + } + + public function getURI() + { + return $this->getDomain() . '/' . $this->getInput('page'); + } + + public function collectData() + { + $html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI()); + + // Since GQ don't want simple class scrapping, let's do it the hard way and ... discover content ! 
+ $main = $html->find('main', 0); + foreach ($main->find('a') as $link) { + $uri = $link->href; + $title = $link->find('h2', 0); + $date = $link->find('time', 0); + + $item = array(); + $author = $link->find('span[itemprop=name]', 0); + $item['author'] = $author->plaintext; + $item['title'] = $title->plaintext; + if(substr($uri, 0, 1) === 'h') { // absolute uri + $item['uri'] = $uri; + } else if(substr($uri, 0, 1) === '/') { // domain relative url + $item['uri'] = $this->getDomain() . $uri; + } else { + $item['uri'] = $this->getDomain() . '/' . $uri; + } + + $article = $this->loadFullArticle($item['uri']); + if($article) { + $item['content'] = $this->replaceUriInHtmlElement($article); + } else { + $item['content'] = "Article body couldn't be loaded. It must be a bug!"; + } + $short_date = $date->datetime; + $item['timestamp'] = strtotime($short_date); + $this->items[] = $item; + } + } + + /** + * Loads the full article and returns the contents + * @param $uri The article URI + * @return The article content + */ + private function loadFullArticle($uri){ + $html = getSimpleHTMLDOMCached($uri); + // Once again, that generated css classes madness is an obstacle ... which i can go over easily + foreach($html->find('div') as $div) { + // List the CSS classes of that div + $classes = $div->class; + // I can't directly lookup that class since GQ since to generate random names like "ArticleBodySection-fkggUW" + if(strpos($classes, 'ArticleBodySection') !== false) { + return $div; + } + } + return null; + } + + /** + * Replaces all relative URIs with absolute ones + * @param $element A simplehtmldom element + * @return The $element->innertext with all URIs replaced + */ + private function replaceUriInHtmlElement($element){ + $returned = $element->innertext; + foreach (self::REPLACED_ATTRIBUTES as $initial => $final) { + $returned = str_replace($initial . '="/', $final . '="' . self::URI . 
'/', $returned); + } + return $returned; + } +} diff --git a/bridges/SexactuBridge.php b/bridges/SexactuBridge.php deleted file mode 100644 index b0a71745..00000000 --- a/bridges/SexactuBridge.php +++ /dev/null @@ -1,88 +0,0 @@ - 'href', - 'src' => 'src', - 'data-original' => 'src' - ); - - public function getURI(){ - return self::URI . '/sexactu'; - } - - public function collectData(){ - $html = getSimpleHTMLDOM($this->getURI()) - or returnServerError('Could not request ' . $this->getURI()); - - $sexactu = $html->find('.container_sexactu', 0); - $rowList = $sexactu->find('.row'); - foreach($rowList as $row) { - // only use first list as second one only contains pages numbers - - $title = $row->find('.title', 0); - if($title) { - $item = array(); - $item['author'] = self::AUTHOR; - $item['title'] = $title->plaintext; - $urlAttribute = 'data-href'; - $uri = $title->$urlAttribute; - if($uri === false) - continue; - if(substr($uri, 0, 1) === 'h') { // absolute uri - $item['uri'] = $uri; - } else if(substr($uri, 0, 1) === '/') { // domain relative url - $item['uri'] = self::URI . $uri; - } else { - $item['uri'] = $this->getURI() . $uri; - } - $article = $this->loadFullArticle($item['uri']); - $item['content'] = $this->replaceUriInHtmlElement($article->find('.article_content', 0)); - - $publicationDate = $article->find('time[itemprop=datePublished]', 0); - $short_date = $publicationDate->datetime; - $item['timestamp'] = strtotime($short_date); - } else { - // Sometimes we get rubbish, ignore. 
- continue; - } - $this->items[] = $item; - } - } - - /** - * Loads the full article and returns the contents - * @param $uri The article URI - * @return The article content - */ - private function loadFullArticle($uri){ - $html = getSimpleHTMLDOMCached($uri); - - $content = $html->find('#article', 0); - if($content) { - return $content; - } - - return null; - } - - /** - * Replaces all relative URIs with absolute ones - * @param $element A simplehtmldom element - * @return The $element->innertext with all URIs replaced - */ - private function replaceUriInHtmlElement($element){ - $returned = $element->innertext; - foreach (self::REPLACED_ATTRIBUTES as $initial => $final) { - $returned = str_replace($initial . '="/', $final . '="' . self::URI . '/', $returned); - } - return $returned; - } -}