nodeType === XML_TEXT_NODE) { $nextNode = $node->nextSibling; if(!$nextNode) { break; } $node = $nextNode; } } private function jumpToPreviousTag(&$node){ while($node && $node->nodeType === XML_TEXT_NODE) { $previousNode = $node->previousSibling; if(!$previousNode) { break; } $node = $previousNode; } } public function collectData(){ // Because the LWN page is written in loose HTML and not XHTML, // Simple HTML Dom is not accurate enough for the job $content = getContents($this->getURI()) or returnServerError('No results for LWNprev'); $contents = explode('Page editor', $content); foreach($contents as $content) { if(strpos($content, '') === false) { $content = << LWN{$content} EOD; } else { $content = $content . ''; } libxml_use_internal_errors(true); $html = new DOMDocument(); $html->loadHTML($content); libxml_clear_errors(); $edition = $html->getElementsByTagName('h1'); if($edition->length !== 0) { $text = $edition->item(0)->textContent; $this->editionTimeStamp = strtotime( substr($text, strpos($text, 'for ') + strlen('for ')) ); } if(strpos($content, 'Cat1HL') === false) { $items = $this->getFeatureContents($html); } elseif(strpos($content, 'Cat3HL') === false) { $items = $this->getBriefItems($html); } else { $items = $this->getAnnouncements($html); } $this->items = array_merge($this->items, $items); } } private function getArticleContent(&$title){ $link = $title->firstChild; $this->jumpToNextTag($link); $item['uri'] = self::URI; if($link->nodeName === 'a') { $item['uri'] .= $link->getAttribute('href'); } $item['timestamp'] = $this->editionTimeStamp; $node = $title; $content = ''; $contentEnd = false; while(!$contentEnd) { $node = $node->nextSibling; if(!$node || ( $node->nodeType !== XML_TEXT_NODE && $node->nodeName === 'h2' || ( !is_null($node->attributes) && !is_null($class = $node->attributes->getNamedItem('class')) && in_array($class->nodeValue, array('Cat1HL','Cat2HL')) ) ) ) { $contentEnd = true; } else { $content .= $node->C14N(); } } $item['content'] = $content; return $item; } private function getFeatureContents(&$html){ $items = array(); foreach($html->getElementsByTagName('h2') as $title) { if($title->getAttribute('class') !== 'SummaryHL') { continue; } $item = array(); $author = $title->nextSibling; $this->jumpToNextTag($author); if($author->getAttribute('class') === 'FeatureByline') { $item['author'] = $author->getElementsByTagName('b')->item(0)->textContent; } else { continue; } $item['title'] = $title->textContent; $items[] = array_merge($item, $this->getArticleContent($title)); } return $items; } private function getItemPrefix(&$cat, &$cats){ $cat1 = ''; $cat2 = ''; $cat3 = ''; switch($cat->getAttribute('class')) { case 'Cat3HL': $cat3 = $cat->textContent; $cat = $cat->previousSibling; $this->jumpToPreviousTag($cat); $cats[2] = $cat3; if($cat->getAttribute('class') !== 'Cat2HL') { break; } case 'Cat2HL': $cat2 = $cat->textContent; $cat = $cat->previousSibling; $this->jumpToPreviousTag($cat); $cats[1] = $cat2; if(empty($cat3)) { $cats[2] = ''; } if($cat->getAttribute('class') !== 'Cat1HL') { break; } case 'Cat1HL': $cat1 = $cat->textContent; $cats[0] = $cat1; if(empty($cat3)) { $cats[2] = ''; } if(empty($cat2)) { $cats[1] = ''; } break; default: break; } $prefix = ''; if(!empty($cats[0])) { $prefix .= '[' . $cats[0] . ($cats[1] ? '/' . $cats[1] : '') . '] '; } return $prefix; } private function getAnnouncements(&$html){ $items = array(); $cats = array('','',''); foreach($html->getElementsByTagName('p') as $newsletters) { if($newsletters->getAttribute('class') !== 'Cat3HL') { continue; } $item = array(); $item['uri'] = self::URI . '#' . count($items); $item['timestamp'] = $this->editionTimeStamp; $item['author'] = 'LWN'; $cat = $newsletters->previousSibling; $this->jumpToPreviousTag($cat); $prefix = $this->getItemPrefix($cat, $cats); $item['title'] = $prefix . ' ' . $newsletters->textContent; $node = $newsletters; $content = ''; $contentEnd = false; while(!$contentEnd) { $node = $node->nextSibling; if(!$node || ( $node->nodeType !== XML_TEXT_NODE && ( !is_null($node->attributes) && !is_null($class = $node->attributes->getNamedItem('class')) && in_array($class->nodeValue, array('Cat1HL','Cat2HL','Cat3HL')) ) ) ) { $contentEnd = true; } else { $content .= $node->C14N(); } } $item['content'] = $content; $items[] = $item; } foreach($html->getElementsByTagName('h2') as $title) { if($title->getAttribute('class') !== 'SummaryHL') { continue; } $item = array(); $cat = $title->previousSibling; $this->jumpToPreviousTag($cat); $cat = $cat->previousSibling; $this->jumpToPreviousTag($cat); $prefix = $this->getItemPrefix($cat, $cats); $item['title'] = $prefix . ' ' . $title->textContent; $items[] = array_merge($item, $this->getArticleContent($title)); } return $items; } private function getBriefItems(&$html){ $items = array(); $cats = array('','',''); foreach($html->getElementsByTagName('h2') as $title) { if($title->getAttribute('class') !== 'SummaryHL') { continue; } $item = array(); $cat = $title->previousSibling; $this->jumpToPreviousTag($cat); $cat = $cat->previousSibling; $this->jumpToPreviousTag($cat); $prefix = $this->getItemPrefix($cat, $cats); $item['title'] = $prefix . ' ' . $title->textContent; $items[] = array_merge($item, $this->getArticleContent($title)); } return $items; } } ?>