Rss-Bridge/bridges/GQMagazineBridge.php

136 lines
3.6 KiB
PHP
Raw Normal View History

<?php
/**
* An extension of the previous SexactuBridge to cover the whole GQMagazine.
* This one taks a page (as an example sexe/news or journaliste/maia-mazaurette) which is to be configured,
* reads all the articles visible on that page, and make a stream out of it.
* @author nicolas-delsaux
*
*/
class GQMagazineBridge extends BridgeAbstract
{
const MAINTAINER = 'Riduidel';
const NAME = 'GQMagazine';
// URI is no more valid, since we can address the whole gq galaxy
const URI = 'https://www.gqmagazine.fr';
const CACHE_TIMEOUT = 7200; // 2h
const DESCRIPTION = 'GQMagazine section extractor bridge. This bridge allows you get only a specific section.';
const DEFAULT_DOMAIN = 'www.gqmagazine.fr';
const PARAMETERS = array( array(
'domain' => array(
'name' => 'Domain to use',
'required' => true,
'defaultValue' => self::DEFAULT_DOMAIN
),
'page' => array(
'name' => 'Initial page to load',
'required' => true,
'exampleValue' => 'sexe/news'
),
));
const REPLACED_ATTRIBUTES = array(
'href' => 'href',
'src' => 'src',
'data-original' => 'src'
);
const POSSIBLE_TITLES = array(
'h2',
'h3'
);
private function getDomain() {
$domain = $this->getInput('domain');
if (empty($domain))
$domain = self::DEFAULT_DOMAIN;
if (strpos($domain, '://') === false)
$domain = 'https://' . $domain;
return $domain;
}
public function getURI()
{
return $this->getDomain() . '/' . $this->getInput('page');
}
private function findTitleOf($link) {
foreach (self::POSSIBLE_TITLES as $tag) {
$title = $link->parent()->find($tag, 0);
if($title !== null) {
if($title->plaintext !== null) {
return $title->plaintext;
}
}
}
}
public function collectData()
{
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
// Since GQ don't want simple class scrapping, let's do it the hard way and ... discover content !
$main = $html->find('main', 0);
foreach ($main->find('a') as $link) {
if(strpos($link, $this->getInput('page')))
continue;
$uri = $link->href;
$date = $link->parent()->find('time', 0);
$item = array();
$author = $link->parent()->find('span[itemprop=name]', 0);
if($author !== null) {
$item['author'] = $author->plaintext;
$item['title'] = $this->findTitleOf($link);
switch(substr($uri, 0, 1)) {
case 'h': // absolute uri
$item['uri'] = $uri;
break;
case '/': // domain relative uri
$item['uri'] = $this->getDomain() . $uri;
break;
default:
$item['uri'] = $this->getDomain() . '/' . $uri;
}
$article = $this->loadFullArticle($item['uri']);
if($article) {
$item['content'] = $this->replaceUriInHtmlElement($article);
} else {
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
}
$short_date = $date->datetime;
$item['timestamp'] = strtotime($short_date);
$this->items[] = $item;
}
}
}
/**
* Loads the full article and returns the contents
* @param $uri The article URI
* @return The article content
*/
private function loadFullArticle($uri){
$html = getSimpleHTMLDOMCached($uri);
return $html->find('section[data-test-id=ArticleBodyContent]', 0);
}
/**
* Replaces all relative URIs with absolute ones
* @param $element A simplehtmldom element
* @return The $element->innertext with all URIs replaced
*/
private function replaceUriInHtmlElement($element){
$returned = $element->innertext;
foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
$returned = str_replace($initial . '="/', $final . '="' . self::URI . '/', $returned);
}
return $returned;
}
}