diff --git a/bridges/GQMagazineBridge.php b/bridges/GQMagazineBridge.php
new file mode 100644
index 00000000..fa36c4e7
--- /dev/null
+++ b/bridges/GQMagazineBridge.php
@@ -0,0 +1,119 @@
+ array(
+            'name' => 'Domain to use',
+            'required' => true,
+            'values' => array(
+                'www.gqmagazine.fr' => 'www.gqmagazine.fr'
+            ),
+            'defaultValue' => 'www.gqmagazine.fr'
+        ),
+        'page' => array(
+            'name' => 'Initial page to load',
+            'required' => true
+        ),
+    ));
+
+    const REPLACED_ATTRIBUTES = array(
+        'href' => 'href',
+        'src' => 'src',
+        'data-original' => 'src'
+    );
+
+    /**
+     * Returns the base URL of the site to scrape, built from the 'domain' input.
+     *
+     * The input is a bare host name such as "www.gqmagazine.fr", so a scheme is
+     * prepended when none is present; otherwise every URI built from it would
+     * not be absolute. Trailing slashes are removed so that callers can always
+     * append '/' themselves.
+     *
+     * @return string Scheme and host without trailing slash, or '' when no
+     * domain was provided
+     */
+    private function getDomain() {
+        $domain = rtrim(trim((string)$this->getInput('domain')), '/');
+        if($domain !== '' && strpos($domain, '://') === false) {
+            $domain = 'https://' . $domain;
+        }
+        return $domain;
+    }
+
+    /**
+     * Returns the URI of the listing page: the selected domain followed by the
+     * 'page' input.
+     *
+     * @return string The URI of the page to scrape
+     */
+    public function getURI()
+    {
+        // A leading slash typed in the 'page' input must not produce "//".
+        return $this->getDomain() . '/' . ltrim((string)$this->getInput('page'), '/');
+    }
+
+    /**
+     * Scrapes the listing page and appends one item per article teaser to
+     * $this->items. The full article body is loaded for each teaser.
+     */
+    public function collectData()
+    {
+        $html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
+
+        // GQ uses generated CSS class names, so simple class scraping is not
+        // possible. Content is discovered structurally instead: every link
+        // inside <main> that wraps a <h2> headline is treated as an article.
+        $main = $html->find('main', 0);
+        if(!$main) {
+            returnServerError('Could not find the main content of ' . $this->getURI());
+        }
+        foreach ($main->find('a') as $link) {
+            $title = $link->find('h2', 0);
+            if(!$title) {
+                // Links without a headline are not article teasers; skipping
+                // them also avoids downloading a page for each of them.
+                continue;
+            }
+
+            $item = array();
+            $item['title'] = $title->plaintext;
+
+            // The author is optional in a teaser
+            $author = $link->find('span[itemprop=name]', 0);
+            if($author) {
+                $item['author'] = $author->plaintext;
+            }
+
+            $uri = (string)$link->href;
+            if(preg_match('#^https?://#i', $uri)) { // absolute uri
+                $item['uri'] = $uri;
+            } else if(substr($uri, 0, 2) === '//') { // protocol relative url
+                $item['uri'] = 'https:' . $uri;
+            } else if(substr($uri, 0, 1) === '/') { // domain relative url
+                $item['uri'] = $this->getDomain() . $uri;
+            } else {
+                $item['uri'] = $this->getDomain() . '/' . $uri;
+            }
+
+            $article = $this->loadFullArticle($item['uri']);
+            if($article) {
+                $item['content'] = $this->replaceUriInHtmlElement($article);
+            } else {
+                $item['content'] = "Article body couldn't be loaded. It must be a bug!";
+            }
+
+            // Only set a timestamp when the teaser carries a parseable date
+            $date = $link->find('time', 0);
+            if($date && $date->datetime) {
+                $timestamp = strtotime($date->datetime);
+                if($timestamp !== false) {
+                    $item['timestamp'] = $timestamp;
+                }
+            }
+            $this->items[] = $item;
+        }
+    }
+
+    /**
+     * Loads the full article and returns the element holding its body
+     * @param $uri The article URI
+     * @return The first div whose class contains "ArticleBodySection", or
+     * null when the page could not be loaded or has no such div
+     */
+    private function loadFullArticle($uri){
+        $html = getSimpleHTMLDOMCached($uri);
+        if(!$html) {
+            return null;
+        }
+        // The class cannot be looked up directly since GQ seems to generate
+        // random names like "ArticleBodySection-fkggUW", hence the substring
+        // match on the class attribute of every div.
+        foreach($html->find('div') as $div) {
+            if(strpos((string)$div->class, 'ArticleBodySection') !== false) {
+                return $div;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Replaces all domain relative URIs with absolute ones
+     *
+     * Each attribute listed in REPLACED_ATTRIBUTES whose value starts with a
+     * single "/" is renamed to its mapped attribute and prefixed with the
+     * selected domain. Protocol relative values ("//host/path") are already
+     * usable as they are and are left untouched.
+     *
+     * @param $element A simplehtmldom element
+     * @return The $element->innertext with all URIs replaced
+     */
+    private function replaceUriInHtmlElement($element){
+        $returned = $element->innertext;
+        $domain = $this->getDomain();
+        foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
+            // (?!/) excludes protocol relative URLs such as src="//cdn.example.com/x"
+            $pattern = '#' . preg_quote($initial, '#') . '="/(?!/)#';
+            $absolute = $final . '="' . $domain . '/';
+            // A callback is used so that the replacement is inserted literally
+            $replaced = preg_replace_callback($pattern, function () use ($absolute) {
+                return $absolute;
+            }, $returned);
+            if($replaced !== null) {
+                $returned = $replaced;
+            }
+        }
+        return $returned;
+    }
+}
diff --git a/bridges/SexactuBridge.php b/bridges/SexactuBridge.php
deleted file mode 100644
index b0a71745..00000000
--- a/bridges/SexactuBridge.php
+++ /dev/null
@@ -1,88 +0,0 @@
- 'href',
-        'src' => 'src',
-        'data-original' => 'src'
-    );
-
-    public function getURI(){
-        return self::URI . '/sexactu';
-    }
-
-    public function collectData(){
-        $html = getSimpleHTMLDOM($this->getURI())
-            or returnServerError('Could not request ' . $this->getURI());
-
-        $sexactu = $html->find('.container_sexactu', 0);
-        $rowList = $sexactu->find('.row');
-        foreach($rowList as $row) {
-            // only use first list as second one only contains pages numbers
-
-            $title = $row->find('.title', 0);
-            if($title) {
-                $item = array();
-                $item['author'] = self::AUTHOR;
-                $item['title'] = $title->plaintext;
-                $urlAttribute = 'data-href';
-                $uri = $title->$urlAttribute;
-                if($uri === false)
-                    continue;
-                if(substr($uri, 0, 1) === 'h') { // absolute uri
-                    $item['uri'] = $uri;
-                } else if(substr($uri, 0, 1) === '/') { // domain relative url
-                    $item['uri'] = self::URI . $uri;
-                } else {
-                    $item['uri'] = $this->getURI() . $uri;
-                }
-                $article = $this->loadFullArticle($item['uri']);
-                $item['content'] = $this->replaceUriInHtmlElement($article->find('.article_content', 0));
-
-                $publicationDate = $article->find('time[itemprop=datePublished]', 0);
-                $short_date = $publicationDate->datetime;
-                $item['timestamp'] = strtotime($short_date);
-            } else {
-                // Sometimes we get rubbish, ignore.
-                continue;
-            }
-            $this->items[] = $item;
-        }
-    }
-
-    /**
-     * Loads the full article and returns the contents
-     * @param $uri The article URI
-     * @return The article content
-     */
-    private function loadFullArticle($uri){
-        $html = getSimpleHTMLDOMCached($uri);
-
-        $content = $html->find('#article', 0);
-        if($content) {
-            return $content;
-        }
-
-        return null;
-    }
-
-    /**
-     * Replaces all relative URIs with absolute ones
-     * @param $element A simplehtmldom element
-     * @return The $element->innertext with all URIs replaced
-     */
-    private function replaceUriInHtmlElement($element){
-        $returned = $element->innertext;
-        foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
-            $returned = str_replace($initial . '="/', $final . '="' . self::URI . '/', $returned);
-        }
-        return $returned;
-    }
-}