array(
            'name' => 'Domain to use',
            'required' => true,
            'defaultValue' => self::DEFAULT_DOMAIN
        ),
        'page' => array(
            'name' => 'Initial page to load',
            'required' => true,
            'exampleValue' => 'sexe/news'
        ),
    ));

    /**
     * Attributes whose root-relative values are rewritten to absolute URIs in
     * article bodies. Keys are the attribute names searched for, values are the
     * attribute names written back (so `data-original` is emitted as `src`).
     */
    const REPLACED_ATTRIBUTES = array(
        'href' => 'href',
        'src' => 'src',
        'data-original' => 'src'
    );

    /**
     * Tags probed, in this order, when looking for the title next to a link.
     */
    const POSSIBLE_TITLES = array(
        'h2',
        'h3'
    );

    /**
     * Returns the base URI (scheme + host) every request is made against.
     *
     * Falls back to DEFAULT_DOMAIN when the 'domain' input is empty, prefixes
     * 'https://' when the value carries no scheme, and strips trailing slashes
     * because every caller appends '/' or a root-relative path itself.
     *
     * @return string The base URI, without trailing slash
     */
    private function getDomain()
    {
        $domain = $this->getInput('domain');
        if (empty($domain)) {
            $domain = self::DEFAULT_DOMAIN;
        }
        if (strpos($domain, '://') === false) {
            $domain = 'https://' . $domain;
        }

        return rtrim($domain, '/');
    }

    /**
     * Builds the URI of the listing page from the configured domain and the
     * 'page' input.
     *
     * @return string The listing page URI
     */
    public function getURI()
    {
        return $this->getDomain() . '/' . $this->getInput('page');
    }

    /**
     * Looks for a title next to a link: the first POSSIBLE_TITLES tag found
     * under the link's parent element that has a plain-text value.
     *
     * @param $link A simplehtmldom element (the article link)
     * @return The title text, or null when no candidate tag is found
     */
    private function findTitleOf($link)
    {
        $container = $link->parent();
        if ($container === null) {
            return null;
        }

        foreach (self::POSSIBLE_TITLES as $tag) {
            $title = $container->find($tag, 0);
            if ($title !== null && $title->plaintext !== null) {
                return $title->plaintext;
            }
        }

        return null;
    }

    /**
     * Turns a link target found on the listing page into an absolute URI.
     *
     * @param $uri The raw href value (absolute, protocol-relative,
     *             root-relative or relative)
     * @return string The absolute URI
     */
    private function toAbsoluteUri($uri)
    {
        $uri = (string) $uri;
        $domain = $this->getDomain();

        // Already absolute: only a real http(s) scheme counts, not any string
        // that merely starts with the letter "h".
        if (preg_match('#^https?://#i', $uri) === 1) {
            return $uri;
        }

        // Protocol-relative ("//host/path"): reuse the scheme of the
        // configured domain. getDomain() always contains "://".
        if (substr($uri, 0, 2) === '//') {
            return substr($domain, 0, strpos($domain, '://') + 1) . $uri;
        }

        // Root-relative
        if (substr($uri, 0, 1) === '/') {
            return $domain . $uri;
        }

        return $domain . '/' . $uri;
    }

    /**
     * Scans the listing page and appends one item per article link to
     * $this->items.
     *
     * A link is kept only when its parent element holds an author
     * (span[itemprop=name]). Each kept item gets author, title, uri, content
     * and, when a parsable <time datetime="..."> is present, a timestamp.
     */
    public function collectData()
    {
        $html = getSimpleHTMLDOM($this->getURI())
            or returnServerError('Could not request ' . $this->getURI());

        // No dedicated CSS classes are relied upon here: every link found
        // inside <main> is inspected to discover the articles.
        $main = $html->find('main', 0);
        if ($main === null) {
            returnServerError('Could not find the main content of ' . $this->getURI());
            // NOTE(review): returnServerError presumably aborts by throwing;
            // this return only guards against it not doing so.
            return;
        }

        $page = (string) $this->getInput('page');

        foreach ($main->find('a') as $link) {
            // Skip links whose markup mentions the listing page itself.
            if ($page !== '' && strpos((string) $link, $page) !== false) {
                continue;
            }

            $container = $link->parent();
            if ($container === null) {
                continue;
            }

            // Links without an author next to them are not articles.
            $author = $container->find('span[itemprop=name]', 0);
            if ($author === null) {
                continue;
            }

            $item = array();
            $item['author'] = $author->plaintext;
            $item['title'] = $this->findTitleOf($link);
            $item['uri'] = $this->toAbsoluteUri($link->href);

            $article = $this->loadFullArticle($item['uri']);
            if ($article) {
                $item['content'] = $this->replaceUriInHtmlElement($article);
            } else {
                // The message deliberately contains a line break after "loaded. ".
                $item['content'] = "Article body couldn't be loaded. \nIt must be a bug!";
            }

            // The date is optional: leave the timestamp unset rather than
            // failing on a missing <time> element or an unparsable value.
            $date = $container->find('time', 0);
            if ($date !== null) {
                $timestamp = strtotime($date->datetime);
                if ($timestamp !== false) {
                    $item['timestamp'] = $timestamp;
                }
            }

            $this->items[] = $item;
        }
    }

    /**
     * Loads the full article and returns the contents
     * @param $uri The article URI
     * @return The article content element, or null when the page could not be
     *         loaded or holds no section[data-test-id=MainContentWrapper]
     */
    private function loadFullArticle($uri)
    {
        $html = getSimpleHTMLDOMCached($uri);
        if (!$html) {
            return null;
        }

        return $html->find('section[data-test-id=MainContentWrapper]', 0);
    }

    /**
     * Replaces all root-relative URIs with absolute ones
     *
     * URIs are resolved against the configured domain (see getDomain()), the
     * same base used for the item links, so a non-default 'domain' input is
     * honoured in the article body as well.
     *
     * @param $element A simplehtmldom element
     * @return The $element->innertext with all URIs replaced
     */
    private function replaceUriInHtmlElement($element)
    {
        $returned = $element->innertext;
        $domain = $this->getDomain();

        foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
            $returned = str_replace($initial . '="/', $final . '="' . $domain . '/', $returned);
        }

        return $returned;
    }
}