<?php
/**
 * Bridge exposing the articles of the LWN.net free weekly edition
 * ("big page" format) as feed items.
 *
 * NOTE(review): this class was recovered from a flattened "new file" diff of
 * bridges/LWNprevBridge.php in which the opening PHP tag, the class
 * declaration and the header of the first method had been stripped. The class
 * name is taken from the file name; the parent class (which must provide
 * returnError() and $items) and the name of the metadata method are
 * assumptions — confirm both against the code base before merging.
 */
class LWNprevBridge extends BridgeAbstract{

    /**
     * Fill in the static description of this bridge.
     */
    public function loadMetadatas(){
        $this->maintainer = 'Pierre Mazière';
        $this->name = 'LWN Free Weekly Edition';
        $this->uri = 'https://lwn.net/free/bigpage';
        $this->description = 'LWN Free Weekly Edition available one week late';
        // The original literal was '2016-19-01', which has no valid month;
        // presumably day and month were transposed — TODO confirm.
        $this->update = '2016-01-19';
    }

    /**
     * Advance $node past text nodes to the next sibling that is not one.
     *
     * The node is left untouched when it is null or already a non-text node.
     * When the run of text nodes reaches the end of the sibling list, $node
     * is left pointing at the last text node, so callers must still check
     * what they got back.
     *
     * @param DOMNode|null $node Start node, updated in place.
     */
    private function jumpToNextTag(&$node){
        while($node && $node->nodeType===XML_TEXT_NODE){
            $nextNode=$node->nextSibling;
            if(!$nextNode){
                break;
            }
            $node=$nextNode;
        }
    }

    /**
     * Mirror of jumpToNextTag(): walk backwards past text nodes.
     *
     * Same caveat: when only text nodes precede the start node, $node ends up
     * on the first of them rather than on an element.
     *
     * @param DOMNode|null $node Start node, updated in place.
     */
    private function jumpToPreviousTag(&$node){
        while($node && $node->nodeType===XML_TEXT_NODE){
            $previousNode=$node->previousSibling;
            if(!$previousNode){
                break;
            }
            $node=$previousNode;
        }
    }

    /**
     * Return the "class" attribute of a node, or '' when the node is not an
     * element (null, text, comment, ...).
     *
     * Only DOMElement has getAttribute(); calling it on anything else is a
     * fatal error, and the jump helpers above may hand back such nodes.
     *
     * @param DOMNode|null $node
     * @return string
     */
    private function getClassOf($node){
        return ($node instanceof DOMElement) ? $node->getAttribute('class') : '';
    }

    /**
     * Work out the author of the article introduced by a headline.
     *
     * The byline is the first non-text sibling following the headline:
     *  - class "FeatureByline": the text of its first <b> element;
     *  - class "GAByline": its text starting at (and including) 'by ', or the
     *    whole text when 'by ' does not occur;
     *  - anything else: 'LWN'.
     *
     * @param DOMElement $h2 Article headline.
     * @return string
     */
    private function extractAuthor($h2){
        $byline=$h2->nextSibling;
        $this->jumpToNextTag($byline);

        switch($this->getClassOf($byline)){
            case 'FeatureByline':
                $bold=$byline->getElementsByTagName('b')->item(0);
                // A byline without <b> falls back to the default author.
                return $bold ? $bold->textContent : 'LWN';
            case 'GAByline':
                $text=$byline->textContent;
                $byPosition=strpos($text,'by ');
                return $byPosition===false ? $text : substr($text,$byPosition);
            default:
                return 'LWN';
        }
    }

    /**
     * Concatenate the canonical XML (C14N) of every sibling that follows a
     * headline, stopping at the next <h2> element, at an element whose class
     * is Cat1HL or Cat2HL, or at the end of the sibling list.
     *
     * @param DOMElement $h2 Article headline.
     * @return string
     */
    private function extractContent($h2){
        $content='';
        for($node=$h2->nextSibling; $node; $node=$node->nextSibling){
            // Only elements can end the article; text and comment nodes are
            // always copied.
            if(
                $node instanceof DOMElement && (
                    $node->tagName==='h2' ||
                    in_array($node->getAttribute('class'),array('Cat1HL','Cat2HL'),true)
                )
            ){
                break;
            }
            $content.=$node->C14N();
        }
        return $content;
    }

    /**
     * Download the weekly edition and append one item per "SummaryHL"
     * headline to $this->items.
     *
     * @param array $param Unused by this bridge.
     */
    public function collectData(array $param){
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job

        $baseURI='https://lwn.net';
        $uri=$this->getURI();

        $context=null;
        if(defined('PROXY_URL')){
            $context=stream_context_create(array(
                'http' => array(
                    'proxy' => PROXY_URL,
                    'request_fulluri' => true,
                ),
            ));
        }

        $html=file_get_contents($uri, false, $context);
        if(!$html){
            $this->returnError('No results for LWNprev', 404);
            // Defensive: stop here in case returnError() returns instead of
            // aborting the request.
            return;
        }

        // loadHTML() is an instance method; calling it statically is an
        // error on current PHP versions. The libxml error mode is restored
        // afterwards so the rest of the application is not affected.
        $previousErrorMode=libxml_use_internal_errors(true);
        $document=new DOMDocument();
        $loaded=$document->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
        if(!$loaded){
            $this->returnError('No results for LWNprev', 404);
            return;
        }

        // URI of the multi-page version of the edition, used as the base for
        // headlines that carry no link of their own. Fall back to the big
        // page when the link is missing instead of reusing an unrelated
        // anchor.
        $realURI=$uri;
        foreach($document->getElementsByTagName('a') as $a){
            if($a->textContent==='Multi-page format'){
                $realURI=$baseURI.$a->getAttribute('href');
                break;
            }
        }

        // The first <h1> is expected to end with "for <date>"; that date is
        // the timestamp base of every item. Use the current time when it
        // cannot be parsed.
        $editionTimeStamp=false;
        $h1=$document->getElementsByTagName('h1')->item(0);
        if($h1){
            $edition=$h1->textContent;
            $marker='for ';
            $markerPosition=strpos($edition,$marker);
            if($markerPosition!==false){
                $editionTimeStamp=strtotime(
                    substr($edition,$markerPosition+strlen($marker))
                );
            }
        }
        if($editionTimeStamp===false){
            $editionTimeStamp=time();
        }

        // Category headings apply to every following article until they are
        // replaced, so they are remembered across iterations.
        $cat1='';
        $cat2='';
        $URICounter=0;

        foreach($document->getElementsByTagName('h2') as $h2){
            if($h2->getAttribute('class')!=='SummaryHL'){
                continue;
            }

            $item = new \Item();
            $item->name=$this->extractAuthor($h2);

            $h2FirstChild=$h2->firstChild;
            $this->jumpToNextTag($h2FirstChild);
            if($h2FirstChild instanceof DOMElement && $h2FirstChild->tagName==='a'){
                $item->uri=$baseURI.$h2FirstChild->getAttribute('href');
            }else{
                $item->uri=$realURI.'#'.$URICounter;
            }
            $URICounter++;

            // Offsetting by the (already incremented) counter gives each
            // item of the edition a distinct timestamp in page order.
            $item->timestamp=$editionTimeStamp+$URICounter;

            $h2PrevSibling=$h2->previousSibling;
            $this->jumpToPreviousTag($h2PrevSibling);
            switch($this->getClassOf($h2PrevSibling)){
                case 'Cat2HL':
                    $cat2=$h2PrevSibling->textContent;
                    // A sub-category may itself directly follow a new
                    // top-level category heading.
                    $h2PrevSibling=$h2PrevSibling->previousSibling;
                    $this->jumpToPreviousTag($h2PrevSibling);
                    if($this->getClassOf($h2PrevSibling)==='Cat1HL'){
                        $cat1=$h2PrevSibling->textContent;
                    }
                    break;
                case 'Cat1HL':
                    $cat1=$h2PrevSibling->textContent;
                    $cat2='';
                    break;
                default:
                    break;
            }

            $item->title='';
            if(!empty($cat1)){
                $item->title.='['.$cat1.($cat2?'/'.$cat2:'').'] ';
            }
            $item->title.=$h2->textContent;

            $item->content=$this->extractContent($h2);
            $this->items[]=$item;
        }
    }

    /**
     * @return string Display name of the bridge.
     */
    public function getName(){
        return 'LWN Free Weekly Edition';
    }

    /**
     * @return string URI of the page this bridge reads.
     */
    public function getURI(){
        return 'https://lwn.net/free/bigpage';
    }

    /**
     * @return int Cache lifetime in seconds.
     */
    public function getCacheDuration(){
        return 604800; // one week
    }
}