[GQMagazineBridge] Fix bridge (#1195)

* Fix bridge by changing the way the articles are loaded AND their titles are found
This commit is contained in:
Nicolas Delsaux 2019-06-28 19:29:32 +02:00 committed by LogMANOriginal
parent c39e642877
commit 09113c2594

View file

@ -40,6 +40,11 @@ class GQMagazineBridge extends BridgeAbstract
'data-original' => 'src'
);
// Tags tried, in this order, when looking for an article title inside a link (see findTitleOf()).
const POSSIBLE_TITLES = array(
'h2',
'h3'
);
private function getDomain() {
$domain = $this->getInput('domain');
if (empty($domain))
@ -54,6 +59,17 @@ class GQMagazineBridge extends BridgeAbstract
return $this->getDomain() . '/' . $this->getInput('page');
}
/**
 * Returns the text of the first usable title element found inside a link.
 *
 * The tags listed in self::POSSIBLE_TITLES are tried in order. A tag that is
 * missing, or whose text is empty once trimmed, is skipped so that a later
 * tag can still supply the title.
 *
 * @param object $link Element exposing find($selector, $index); presumably a
 *                     simple_html_dom node for an <a> tag (confirm against caller)
 * @return string|null Trimmed title text, or null when no tag yields any text
 */
private function findTitleOf($link) {
    foreach (self::POSSIBLE_TITLES as $tag) {
        $title = $link->find($tag, 0);
        if ($title === null) {
            continue;
        }

        $text = $title->plaintext;
        if ($text === null) {
            continue;
        }

        // An empty heading must not win over a later tag holding the real title.
        $text = trim((string)$text);
        if ($text !== '') {
            return $text;
        }
    }

    // Explicit "not found" result instead of an implicit fall-through.
    return null;
}
public function collectData()
{
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
@ -62,21 +78,23 @@ class GQMagazineBridge extends BridgeAbstract
$main = $html->find('main', 0);
foreach ($main->find('a') as $link) {
$uri = $link->href;
$title = $link->find('h2', 0);
$date = $link->find('time', 0);
$item = array();
$author = $link->find('span[itemprop=name]', 0);
if($author !== null) {
$item['author'] = $author->plaintext;
$item['title'] = $title->plaintext;
if(substr($uri, 0, 1) === 'h') { // absolute uri
$item['title'] = $this->findTitleOf($link);
switch(substr($uri, 0, 1)) {
case 'h': // absolute uri
$item['uri'] = $uri;
} else if(substr($uri, 0, 1) === '/') { // domain relative url
break;
case '/': // domain relative uri
$item['uri'] = $this->getDomain() . $uri;
} else {
break;
default:
$item['uri'] = $this->getDomain() . '/' . $uri;
}
$article = $this->loadFullArticle($item['uri']);
if($article) {
$item['content'] = $this->replaceUriInHtmlElement($article);
@ -88,6 +106,7 @@ class GQMagazineBridge extends BridgeAbstract
$this->items[] = $item;
}
}
}
/**
* Loads the full article and returns the contents
@ -96,16 +115,7 @@ class GQMagazineBridge extends BridgeAbstract
*/
private function loadFullArticle($uri){
    $html = getSimpleHTMLDOMCached($uri);

    // Guard before calling find(): assumes the helper may hand back a falsy
    // value when the page cannot be loaded -- TODO confirm against the helper.
    if (!$html) {
        return null;
    }

    // Preferred lookup: select the article body through its data-test-id
    // attribute rather than through a generated CSS class name.
    $body = $html->find('section[data-test-id=ArticleBodyContent]', 0);
    if ($body !== null) {
        return $body;
    }

    // Fallback for the class-based markup: GQ seems to generate suffixed class
    // names like "ArticleBodySection-fkggUW", so the class cannot be looked up
    // directly and is matched on its fixed prefix instead.
    foreach ($html->find('div') as $div) {
        if (strpos($div->class, 'ArticleBodySection') !== false) {
            return $div;
        }
    }

    // Neither layout matched: no article body available.
    return null;
}
/**