[SexactuBridge] Use the latest version of the bridge API and cached pages (#504)

Fixes #503 by using the latest version of the bridge API and cached pages.
This commit is contained in:
Nicolas Delsaux 2017-04-09 21:15:01 +02:00 committed by LogMANOriginal
parent 360f9da072
commit f3b6b264d3

View file

@ -3,97 +3,87 @@ class SexactuBridge extends BridgeAbstract {
const MAINTAINER = 'Riduidel'; const MAINTAINER = 'Riduidel';
const NAME = 'Sexactu'; const NAME = 'Sexactu';
const URI = 'https://www.gqmagazine.fr'; const AUTHOR = 'Maïa Mazaurette';
const DOMAIN = 'http://www.gqmagazine.fr';
const CACHE_TIMEOUT = 7200; // 2h const CACHE_TIMEOUT = 7200; // 2h
const DESCRIPTION = 'Sexactu via rss-bridge'; const DESCRIPTION = 'Sexactu via rss-bridge';
// Attribute rewrite table read by replaceUriInHtmlElement(). Each key is an
// attribute name looked up in the article HTML where its value starts with "/";
// each value is the attribute name written back, with DOMAIN prefixed to the path.
// 'data-original' is rewritten to 'src' — presumably so lazy-loaded images show
// up in the feed; TODO confirm against the site's markup.
const REPLACED_ATTRIBUTES = array(
'href' => 'href',
'src' => 'src',
'data-original' => 'src'
);
/**
 * Address of the Sexactu section, built from the bridge's DOMAIN constant.
 *
 * @return string DOMAIN followed by '/sexactu'
 */
public function getURI(){
	$sectionPath = '/sexactu';
	return self::DOMAIN . $sectionPath;
}
public function collectData(){ public function collectData(){
// French month names in calendar order. str_replace() pairs each entry with the
// English name at the same index in $replace, so both lists must contain all
// twelve months in the same order. 'octobre' was previously missing, which
// shifted 'novembre' and 'décembre' onto the wrong English month and left
// October dates untranslated.
$find = array(
	'janvier',
	'février',
	'mars',
	'avril',
	'mai',
	'juin',
	'juillet',
	'août',
	'septembre',
	'octobre',
	'novembre',
	'décembre'
);
// English month names, index-aligned with $find, used so the translated date
// text can be handed to strtotime().
$replace = array(
	'January',
	'February',
	'March',
	'April',
	'May',
	'June',
	'July',
	'August',
	'September',
	'October',
	'November',
	'December'
);
$html = getSimpleHTMLDOM($this->getURI()) $html = getSimpleHTMLDOM($this->getURI())
or returnServerError('Could not request ' . $this->getURI()); or returnServerError('Could not request ' . $this->getURI());
foreach($html->find('.content-holder') as $contentHolder){ $sexactu = $html->find('.container_sexactu', 0);
$rowList = $sexactu->find('.row');
foreach($rowList as $row){
// only use first list as second one only contains pages numbers // only use first list as second one only contains pages numbers
$articles = $contentHolder->find('ul', 0);
foreach($articles->find('li') as $element){
// if you ask about that method_exists, there seems to be a bug in simple html dom
// see stackoverflow for more details : http://stackoverflow.com/a/10828479/15619
if(is_object($element)){
$item = array();
// various metadata
$titleBlock = $element->find('.title-holder', 0);
if(is_object($titleBlock)){
$titleDetails = $titleBlock->find('.article-title', 0);
$titleData = $titleDetails->find('h2', 0)->find('a', 0);
$titleTimestamp = $titleDetails->find('h4', 0);
$item['title'] = $this->correctCase(trim($titleData->innertext));
$item['uri'] = self::URI . $titleData->href;
// Fugly date parsing due to the fact my DNS-323 doesn't support php intl extension $title = $row->find('.title', 0);
$dateText = $titleTimestamp->innertext; if($title){
$dateText = substr($dateText, strpos($dateText, ',') + 1); $item = array();
$dateText = str_replace($find, $replace, strtolower($dateText)); $item['author'] = self::AUTHOR;
$date = strtotime($dateText); $item['title'] = $title->plaintext;
$item['timestamp'] = $date; $urlAttribute = "data-href";
$uri = $title->$urlAttribute;
$item['author'] = 'Maïa Mazaurette'; if($uri === false)
$elementText = $element->find('.text-container', 0); continue;
// don't forget to replace images server url with gq one if(substr($uri, 0, 1) === 'h'){ // absolute uri
foreach($elementText->find('img') as $image){ $item['uri'] = $uri;
$image->src = self::URI . $image->src; } else if(substr($uri, 0, 1) === '/'){ // domain relative url
} $item['uri'] = self::DOMAIN . $uri;
$item['content'] = $elementText->innertext; } else {
$this->items[] = $item; $item['uri'] = $this->getURI() . $uri;
}
} }
$article = $this->loadFullArticle($item['uri']);
$item['content'] = $this->replaceUriInHtmlElement($article->find('.article_content', 0));
$publicationDate = $article->find('time[itemprop=datePublished]', 0);
$short_date = $publicationDate->datetime;
$item['timestamp'] = date_parse($short_date);
} else {
// Sometimes we get rubbish, ignore.
continue;
} }
$this->items[] = $item;
} }
} }
public function getURI(){ /**
return self::URI . '/sexactu'; * Loads the full article and returns the contents
* @param $uri The article URI
* @return The article content
*/
private function loadFullArticle($uri){
// Fetch and parse the article page. Judging by its name, the helper serves a
// cached copy when one exists — TODO confirm against the helper's definition.
// NOTE(review): the result is not checked before ->find() is called on it,
// unlike the "or returnServerError(...)" guard used in collectData().
$html = getSimpleHTMLDOMCached($uri);
// Select the element with id "article" (index 0 asks for a single match).
$content = $html->find('#article', 0);
if($content){
return $content;
}
// No such element on the page: report absence with null.
// NOTE(review): collectData() calls ->find() on this return value without a
// null check — confirm whether a missing article should be skipped instead.
return null;
} }
private function correctCase($str){ /**
$sentences = explode('.', mb_strtolower($str, 'UTF-8')); * Replaces all relative URIs with absolute ones
$str = ''; * @param $element A simplehtmldom element
$sep = ''; * @return The $element->innertext with all URIs replaced
foreach ($sentences as $sentence){ */
//upper case first char private function replaceUriInHtmlElement($element){
$sentence = ucfirst(trim($sentence)); $returned = $element->innertext;
foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
//append sentence to output $returned = str_replace($initial.'="/', $final.'="' . self::DOMAIN . '/', $returned);
$str = $str . $sep . $sentence;
$sep = '. ';
} }
return $str; return $returned;
} }
} }