add bridge for LWN Free Weekly Edition

Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
This commit is contained in:
Pierre Mazière 2016-06-19 00:41:02 +02:00
parent 040f4da73d
commit dc6928316b

156
bridges/LWNprevBridge.php Normal file
View file

@ -0,0 +1,156 @@
<?php
/**
* RssBridgeLWNprev
*
* @name LWNPrev Bridge
* @description Returns the articles from the previous LWN.net edition
*/
/**
 * Bridge exposing the articles of the LWN.net free weekly edition.
 *
 * The whole edition is fetched as a single "big page" and split into one
 * item per article headline (an h2 element with class "SummaryHL").
 */
class LWNprevBridge extends BridgeAbstract{

    /**
     * Fills in the static metadata describing this bridge.
     */
    public function loadMetadatas() {
        $this->maintainer = 'Pierre Mazière';
        $this->name = 'LWN Free Weekly Edition';
        $this->uri = 'https://lwn.net/free/bigpage';
        $this->description = 'LWN Free Weekly Edition available one week late';
        // Previously '2016-19-01', which is not a valid Y-m-d date (month 19).
        // NOTE(review): 2016-06-19 is the date this bridge was committed — confirm.
        $this->update = '2016-06-19';
    }

    /**
     * Moves $node forward until it designates an element.
     *
     * Text, comment and any other non-element nodes are skipped. $node is
     * set to null when no element is found at or after the starting node.
     *
     * @param DOMNode|null $node starting node, updated in place
     */
    private function jumpToNextTag(&$node){
        while($node && !($node instanceof DOMElement)){
            $node=$node->nextSibling;
        }
    }

    /**
     * Moves $node backward until it designates an element.
     *
     * Text, comment and any other non-element nodes are skipped. $node is
     * set to null when no element is found at or before the starting node.
     *
     * @param DOMNode|null $node starting node, updated in place
     */
    private function jumpToPreviousTag(&$node){
        while($node && !($node instanceof DOMElement)){
            $node=$node->previousSibling;
        }
    }

    /**
     * Reads the edition date from the first h1 of the page.
     *
     * The date is taken from the text following the first "for " in the h1.
     *
     * @param DOMDocument $html parsed edition page
     * @return int Unix timestamp of the edition, or the current time when
     *             the heading is missing or its date cannot be parsed
     */
    private function getEditionTimestamp(DOMDocument $html){
        $marker='for ';
        $h1=$html->getElementsByTagName('h1')->item(0);
        if($h1){
            $edition=$h1->textContent;
            $markerPos=strpos($edition,$marker);
            if($markerPos!==false){
                $timestamp=strtotime(substr($edition,$markerPos+strlen($marker)));
                if($timestamp!==false){
                    return $timestamp;
                }
            }
        }
        // Without a usable date, fall back to "now" rather than letting
        // false be used in arithmetic and produce timestamps next to epoch 0
        return time();
    }

    /**
     * Determines the author of an article from the byline following its headline.
     *
     * @param DOMElement $h2 article headline
     * @return string author name, 'LWN' when no known byline is found
     */
    private function getAuthor(DOMElement $h2){
        $byline=$h2->nextSibling;
        $this->jumpToNextTag($byline);
        if(!$byline){
            return 'LWN';
        }

        switch($byline->getAttribute('class')){
        case 'FeatureByline':
            $bold=$byline->getElementsByTagName('b')->item(0);
            return $bold ? $bold->textContent : 'LWN';
        case 'GAByline':
            $text=$byline->textContent;
            $byPos=strpos($text,'by ');
            // NOTE(review): the returned name keeps its leading "by ";
            // this matches the historical output — confirm it is intended.
            return $byPos===false ? $text : substr($text,$byPos);
        default:
            return 'LWN';
        }
    }

    /**
     * Serializes every node following the headline up to the next headline
     * or category heading.
     *
     * @param DOMElement $h2 article headline
     * @return string canonicalized (C14N) markup of the article body
     */
    private function getContent(DOMElement $h2){
        $content='';
        for($node=$h2->nextSibling; $node; $node=$node->nextSibling){
            // Only elements carry a tag name and attributes; text, comments
            // and the like are always part of the content
            if(
                $node instanceof DOMElement && (
                    $node->tagName==='h2' ||
                    in_array($node->getAttribute('class'),array('Cat1HL','Cat2HL'),true)
                )
            ){
                break;
            }
            $content.=$node->C14N();
        }
        return $content;
    }

    /**
     * Fetches the free weekly edition and appends one item per article
     * to $this->items.
     *
     * @param array $param request parameters (not used by this bridge)
     */
    public function collectData(array $param){
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job
        $content = advanced_file_get_contents($this->getURI())
            or $this->returnError('No results for LWNprev', 404);

        // Loose HTML triggers many libxml warnings: collect and discard them
        libxml_use_internal_errors(true);
        // loadHTML() is an instance method: calling it statically is
        // deprecated in PHP 7 and an error in PHP 8
        $html=new DOMDocument();
        $loaded=$html->loadHTML($content);
        libxml_clear_errors();
        if(!$loaded){
            $this->returnError('No results for LWNprev', 404);
            return;
        }

        // Categories persist from one article to the next until a new
        // category heading is met
        $cat1='';
        $cat2='';

        // Address of the multi-page version of this edition, used as base
        // for articles lacking a link of their own. Only a matching link
        // contributes its href.
        $realURI='https://lwn.net';
        foreach($html->getElementsByTagName('a') as $a){
            if($a->textContent==='Multi-page format'){
                $realURI.=$a->getAttribute('href');
                break;
            }
        }

        $URICounter=0;
        $editionTimeStamp=$this->getEditionTimestamp($html);

        foreach($html->getElementsByTagName('h2') as $h2){
            if($h2->getAttribute('class')!=='SummaryHL'){
                continue;
            }

            $item = new \Item();
            $item->name=$this->getAuthor($h2);

            // Use the headline's own link when it has one, otherwise build
            // a unique URI from the edition address and the article rank
            $h2FirstChild=$h2->firstChild;
            $this->jumpToNextTag($h2FirstChild);
            if($h2FirstChild && $h2FirstChild->tagName==='a'){
                $item->uri='https://lwn.net'.$h2FirstChild->getAttribute('href');
            }else{
                $item->uri=$realURI.'#'.$URICounter;
            }

            // All articles share the edition date: the counter keeps the
            // timestamps distinct and ordered as on the page
            $URICounter++;
            $item->timestamp=$editionTimeStamp+$URICounter;

            // Category headings, when present, immediately precede the
            // headline: an optional Cat1HL, then an optional Cat2HL
            $h2PrevSibling=$h2->previousSibling;
            $this->jumpToPreviousTag($h2PrevSibling);
            $previousClass=$h2PrevSibling ? $h2PrevSibling->getAttribute('class') : '';
            switch($previousClass){
            case 'Cat2HL':
                $cat2=$h2PrevSibling->textContent;
                $h2PrevSibling=$h2PrevSibling->previousSibling;
                $this->jumpToPreviousTag($h2PrevSibling);
                if($h2PrevSibling && $h2PrevSibling->getAttribute('class')==='Cat1HL'){
                    $cat1=$h2PrevSibling->textContent;
                }
                break;
            case 'Cat1HL':
                $cat1=$h2PrevSibling->textContent;
                $cat2='';
                break;
            default:
                break;
            }

            $item->title='['.$cat1.($cat2?'/'.$cat2:'').'] '.$h2->textContent;
            $item->content=$this->getContent($h2);
            $this->items[]=$item;
        }
    }

    /**
     * @return string display name of the bridge
     */
    public function getName(){
        return 'LWN Free Weekly Edition';
    }

    /**
     * @return string address of the page the items are extracted from
     */
    public function getURI(){
        return 'https://lwn.net/free/bigpage';
    }

    /**
     * @return int cache lifetime in seconds
     */
    public function getCacheDuration(){
        return 604800; // one week
    }
}