[LWNprevBridge] full rewrite

Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
[logmanoriginal@users.noreply.github.com: Fix coding style]
This commit is contained in:
Pierre Mazière 2017-08-03 00:15:55 +02:00 committed by logmanoriginal
parent c7ec50373a
commit 873a91259f

View file

@ -6,8 +6,10 @@ class LWNprevBridge extends BridgeAbstract{
const CACHE_TIMEOUT = 604800; // 1 week const CACHE_TIMEOUT = 604800; // 1 week
const DESCRIPTION = 'LWN Free Weekly Edition available one week late'; const DESCRIPTION = 'LWN Free Weekly Edition available one week late';
private $editionTimeStamp;
/**
 * Returns the address this bridge reads: self::URI followed by 'free/bigpage'.
 *
 * Both columns of the diff are shown on each line; they build the same string
 * and differ only in the spacing around the concatenation operator.
 */
function getURI(){ function getURI(){
return self::URI . 'free/bigpage'; return self::URI.'free/bigpage';
} }
private function jumpToNextTag(&$node){ private function jumpToNextTag(&$node){
@ -36,110 +38,228 @@ class LWNprevBridge extends BridgeAbstract{
$content = getContents($this->getURI()) $content = getContents($this->getURI())
or returnServerError('No results for LWNprev'); or returnServerError('No results for LWNprev');
libxml_use_internal_errors(true); $contents = explode('<b>Page editor</b>', $content);
$html = new DOMDocument();
$html->loadHTML($content);
libxml_clear_errors();
$cat1 = ''; foreach($contents as $content) {
$cat2 = ''; if(strpos($content, '<html>') === false) {
$content = <<<EOD
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html><head><title>LWN</title></head><body>{$content}</body></html>
EOD;
} else {
$content = $content.'</body></html>';
}
foreach($html->getElementsByTagName('a') as $a) { libxml_use_internal_errors(true);
if($a->textContent === 'Multi-page format') { $html = new DOMDocument();
break; $html->loadHTML($content);
libxml_clear_errors();
$edition = $html->getElementsByTagName('h1');
if($edition->length !== 0) {
$text = $edition->item(0)->textContent;
$this->editionTimeStamp = strtotime(
substr($text, strpos($text, 'for ') + strlen('for '))
);
}
if(strpos($content, 'Cat1HL') === false) {
$items = $this->getFeatureContents($html);
} elseif(strpos($content, 'Cat3HL') === false) {
$items = $this->getBriefItems($html);
} else {
$items = $this->getAnnouncements($html);
}
$this->items = array_merge($this->items, $items);
}
}
/**
 * Collects the link, timestamp and body of the article headed by $title.
 *
 * The returned array has three keys:
 *  - 'uri':       self::URI, with the href of the heading's link appended when
 *                 the node reached from $title->firstChild is an <a>.
 *  - 'timestamp': $this->editionTimeStamp.
 *  - 'content':   the concatenated C14N() output of the siblings that follow
 *                 $title, up to the stop condition described below.
 *
 * @param DOMNode $title heading node of the article (callers pass <h2> elements)
 * @return array partial feed item ('uri', 'timestamp', 'content')
 */
private function getArticleContent(&$title){
$link = $title->firstChild;
// jumpToNextTag() is defined outside this view; presumably it advances $link
// to the next element node - TODO confirm.
$this->jumpToNextTag($link);
// $item is created implicitly by this first assignment.
$item['uri'] = self::URI;
if($link->nodeName === 'a') {
$item['uri'] .= $link->getAttribute('href');
}
$item['timestamp'] = $this->editionTimeStamp;
$node = $title;
$content = '';
$contentEnd = false;
while(!$contentEnd) {
$node = $node->nextSibling;
// Stop when the sibling list ends, at a non-text node named 'h2', or at any
// node whose class attribute is Cat1HL or Cat2HL. '&&' binds tighter than
// '||', so the XML_TEXT_NODE test only guards the 'h2' comparison.
if(!$node || (
$node->nodeType !== XML_TEXT_NODE &&
$node->nodeName === 'h2' || (
!is_null($node->attributes) &&
!is_null($class = $node->attributes->getNamedItem('class')) &&
in_array($class->nodeValue, array('Cat1HL','Cat2HL'))
)
)
) {
$contentEnd = true;
} else {
$content .= $node->C14N();
// From here to the 'return', the left-hand half of each line belongs to the
// removed side of the diff, not to this method.
} }
} }
$realURI = self::URI . $a->getAttribute('href'); $item['content'] = $content;
$URICounter = 0; return $item;
}
/**
 * Added side of the diff: getFeatureContents(&$html).
 *
 * Builds one feed item per <h2 class="SummaryHL"> in $html whose following tag
 * has the class 'FeatureByline'. The author is the text of the first <b>
 * inside that byline, the title is the heading text, and the remaining fields
 * come from getArticleContent($title). Returns the list of items.
 *
 * The left-hand half of most lines below is the removed inline implementation
 * and is not part of this method.
 */
$edition = $html->getElementsByTagName('h1')->item(0)->textContent; private function getFeatureContents(&$html){
$editionTimeStamp = strtotime( $items = array();
substr($edition, strpos($edition, 'for ') + strlen('for ')) foreach($html->getElementsByTagName('h2') as $title) {
); if($title->getAttribute('class') !== 'SummaryHL') {
foreach($html->getElementsByTagName('h2') as $h2) {
if($h2->getAttribute('class') !== 'SummaryHL') {
continue; continue;
} }
$item = array(); $item = array();
$h2NextSibling = $h2->nextSibling; $author = $title->nextSibling;
$this->jumpToNextTag($h2NextSibling); $this->jumpToNextTag($author);
// Added side: a heading without a FeatureByline is skipped entirely, whereas
// the removed side fell back to a 'GAByline' author or to 'LWN'.
if($author->getAttribute('class') === 'FeatureByline') {
switch($h2NextSibling->getAttribute('class')) { $item['author'] = $author->getElementsByTagName('b')->item(0)->textContent;
case 'FeatureByline': } else {
$item['author'] = $h2NextSibling->getElementsByTagName('b')->item(0)->textContent; continue;
break;
case 'GAByline':
$text = $h2NextSibling->textContent;
$item['author'] = substr($text, strpos($text, 'by '));
break;
default:
$item['author'] = 'LWN';
break;
};
$h2FirstChild = $h2->firstChild;
$this->jumpToNextTag($h2FirstChild);
if($h2FirstChild->nodeName === 'a') {
$item['uri'] = self::URI . $h2FirstChild->getAttribute('href');
} else{
$item['uri'] = $realURI . '#' . $URICounter;
} }
$URICounter++;
$item['timestamp'] = $editionTimeStamp + $URICounter; $item['title'] = $title->textContent;
$h2PrevSibling = $h2->previousSibling; $items[] = array_merge($item, $this->getArticleContent($title));
$this->jumpToPreviousTag($h2PrevSibling); }
switch($h2PrevSibling->getAttribute('class')) { return $items;
case 'Cat2HL': }
$cat2 = $h2PrevSibling->textContent;
/**
 * Added side of the diff: getItemPrefix(&$cat, &$cats).
 *
 * Updates the running category names in $cats from the category heading $cat
 * and returns the title prefix built from them.
 *
 * The switch relies on intentional fall-through. A Cat3HL heading is stored in
 * $cats[2], then $cat is moved to the previous tag; if that tag is a Cat2HL
 * heading, execution continues into the Cat2HL case, and likewise from Cat2HL
 * into Cat1HL. When a Cat2HL or Cat1HL heading is handled without a
 * lower-level heading having been read in the same call, the lower-level
 * slots of $cats are reset to ''.
 *
 * $cat is taken by reference and is left pointing at whichever node the
 * backwards walk stopped on.
 *
 * Returns '[cats0/cats1] ' (or '[cats0] ' when $cats[1] is empty), including
 * the trailing space, or '' when $cats[0] is empty. $cats[2] is tracked but is
 * not part of the prefix.
 *
 * The left-hand half of many lines below is removed code and is not part of
 * this method.
 */
$h2PrevSibling = $h2PrevSibling->previousSibling; private function getItemPrefix(&$cat, &$cats){
$this->jumpToPreviousTag($h2PrevSibling); $cat1 = '';
if($h2PrevSibling->getAttribute('class') !== 'Cat1HL') { $cat2 = '';
break; $cat3 = '';
} switch($cat->getAttribute('class')) {
$cat1 = $h2PrevSibling->textContent; case 'Cat3HL':
break; $cat3 = $cat->textContent;
case 'Cat1HL': $cat = $cat->previousSibling;
$cat1 = $h2PrevSibling->textContent; $this->jumpToPreviousTag($cat);
$cat2 = ''; $cats[2] = $cat3;
break; if($cat->getAttribute('class') !== 'Cat2HL') {
default:
break; break;
} }
// Added side: no 'break' before this case - fall-through from Cat3HL is intended.
$h2PrevSibling = null; case 'Cat2HL':
$cat2 = $cat->textContent;
$item['title'] = ''; $cat = $cat->previousSibling;
if(!empty($cat1)) { $this->jumpToPreviousTag($cat);
$item['title'] .= '[' . $cat1 . ($cat2 ? '/' . $cat2 : '') . '] '; $cats[1] = $cat2;
if(empty($cat3)) {
$cats[2] = '';
}
$item['title'] .= $h2->textContent; if($cat->getAttribute('class') !== 'Cat1HL') {
break;
}
// Fall-through from Cat2HL is intended here as well.
case 'Cat1HL':
$cat1 = $cat->textContent;
$cats[0] = $cat1;
if(empty($cat3)) {
$cats[2] = '';
}
if(empty($cat2)) {
$cats[1] = '';
}
break;
default:
break;
}
$node = $h2; $prefix = '';
if(!empty($cats[0])) {
$prefix .= '['.$cats[0].($cats[1] ? '/'.$cats[1] : '').'] ';
}
return $prefix;
}
/**
 * Builds the feed items for an announcements page in two passes over $html.
 *
 * Pass 1: every <p class="Cat3HL"> becomes an item authored by 'LWN', stamped
 * with $this->editionTimeStamp, titled with the category prefix plus the
 * paragraph text, and filled with the C14N() output of the following siblings
 * up to the next node whose class is Cat1HL, Cat2HL or Cat3HL.
 *
 * Pass 2: every <h2 class="SummaryHL"> becomes an item whose prefix is taken
 * from the tag two steps before the heading and whose remaining fields come
 * from getArticleContent().
 *
 * $cats carries the current category names across iterations and across both
 * passes. Returns the list of items.
 */
private function getAnnouncements(&$html){
$items = array();
$cats = array('','','');
foreach($html->getElementsByTagName('p') as $newsletters) {
if($newsletters->getAttribute('class') !== 'Cat3HL') {
continue;
}
$item = array();
// NOTE(review): microtime(true) makes this URI different on every run, so
// feed readers that key items on the URI may see each item as new - confirm
// this is intended.
$item['uri'] = self::URI.'#'.microtime(true);
$item['timestamp'] = $this->editionTimeStamp;//+$URICounter;
$item['author'] = 'LWN';
$cat = $newsletters->previousSibling;
$this->jumpToPreviousTag($cat);
$prefix = $this->getItemPrefix($cat, $cats);
// NOTE(review): a non-empty prefix from getItemPrefix() already ends with a
// space, so this yields a double space (or a leading space when the prefix
// is empty).
$item['title'] = $prefix.' '.$newsletters->textContent;
$node = $newsletters;
// From here to the end of the first loop each line shows the removed code on
// the left and the added code on the right. The added condition no longer
// stops at 'h2' nodes and additionally stops at class Cat3HL.
$content = ''; $content = '';
$contentEnd = false; $contentEnd = false;
while(!$contentEnd) { while(!$contentEnd) {
$node = $node->nextSibling; $node = $node->nextSibling;
if(!$node || ( if(!$node || (
$node->nodeType !== XML_TEXT_NODE && ( $node->nodeType !== XML_TEXT_NODE && (
$node->nodeName === 'h2' || ( !is_null($node->attributes) &&
!is_null($node->attributes) && !is_null($class = $node->attributes->getNamedItem('class')) &&
!is_null($class = $node->attributes->getNamedItem('class')) && in_array($class->nodeValue, array('Cat1HL','Cat2HL','Cat3HL'))
in_array($class->nodeValue, array('Cat1HL', 'Cat2HL'))
)
) )
) )
) { ) {
$contentEnd = true; $contentEnd = true;
} else{ } else {
$content .= $node->C14N(); $content .= $node->C14N();
} }
} }
$item['content'] = $content; $item['content'] = $content;
$this->items[] = $item; $items[] = $item;
} }
// Second pass: regular articles on the same page.
foreach($html->getElementsByTagName('h2') as $title) {
if($title->getAttribute('class') !== 'SummaryHL') {
continue;
}
$item = array();
// Step back two tags from the heading to reach the node handed to
// getItemPrefix() (jumpToPreviousTag() is defined outside this view).
$cat = $title->previousSibling;
$this->jumpToPreviousTag($cat);
$cat = $cat->previousSibling;
$this->jumpToPreviousTag($cat);
$prefix = $this->getItemPrefix($cat, $cats);
$item['title'] = $prefix.' '.$title->textContent;
$items[] = array_merge($item, $this->getArticleContent($title));
}
return $items;
}
/**
 * Builds one feed item per <h2 class="SummaryHL"> heading found in $html.
 *
 * For each heading, the tag two steps before it is handed to getItemPrefix()
 * to obtain the category prefix; the title is that prefix followed by the
 * heading text, and the remaining fields ('uri', 'timestamp', 'content') come
 * from getArticleContent().
 *
 * @param DOMDocument $html parsed page fragment
 * @return array list of feed items
 */
private function getBriefItems(&$html){
	$items = array();
	// Running category names, shared across headings: getItemPrefix() updates
	// them by reference so later headings inherit the last category seen.
	$cats = array('','','');

	foreach($html->getElementsByTagName('h2') as $title) {
		if($title->getAttribute('class') !== 'SummaryHL') {
			continue;
		}

		// Walk back two tags from the heading to reach the node that
		// getItemPrefix() inspects.
		$cat = $title->previousSibling;
		$this->jumpToPreviousTag($cat);
		$cat = $cat->previousSibling;
		$this->jumpToPreviousTag($cat);

		$prefix = $this->getItemPrefix($cat, $cats);

		$item = array();
		// getItemPrefix() returns either '' or a string that already ends with
		// a space, so the prefix is concatenated directly. Inserting another
		// ' ' produced a leading space (no category) or a double space.
		$item['title'] = $prefix.$title->textContent;

		$items[] = array_merge($item, $this->getArticleContent($title));
	}

	return $items;
}
} }
?>