[GQMagazineBridge] Fix bridge (#1195)
* Fix bridge by changing the way the articles are loaded AND their titles are found
This commit is contained in:
parent
c39e642877
commit
09113c2594
1 changed file with 39 additions and 29 deletions
|
@ -40,6 +40,11 @@ class GQMagazineBridge extends BridgeAbstract
|
||||||
'data-original' => 'src'
|
'data-original' => 'src'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const POSSIBLE_TITLES = array(
|
||||||
|
'h2',
|
||||||
|
'h3'
|
||||||
|
);
|
||||||
|
|
||||||
private function getDomain() {
|
private function getDomain() {
|
||||||
$domain = $this->getInput('domain');
|
$domain = $this->getInput('domain');
|
||||||
if (empty($domain))
|
if (empty($domain))
|
||||||
|
@ -54,6 +59,17 @@ class GQMagazineBridge extends BridgeAbstract
|
||||||
return $this->getDomain() . '/' . $this->getInput('page');
|
return $this->getDomain() . '/' . $this->getInput('page');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Finds the title text of an article link.
 *
 * Tries each tag name listed in self::POSSIBLE_TITLES, in order, and returns
 * the text of the first matching descendant of $link that is not blank. A
 * matching element whose text is empty or whitespace-only is skipped so that
 * a later candidate tag can still provide the title.
 *
 * @param object $link Element to search. It must expose find($selector, $index)
 *                     returning either null or an element with a `plaintext`
 *                     property.
 * @return string|null Trimmed title text, or null when none of the candidate
 *                     tags yields any text.
 */
private function findTitleOf($link) {
    foreach (self::POSSIBLE_TITLES as $tag) {
        $title = $link->find($tag, 0);
        if ($title === null) {
            continue;
        }

        // Cast first: plaintext may be null/false on an odd node, and trim()
        // should only ever see a string.
        $text = trim((string)$title->plaintext);
        if ($text !== '') {
            return $text;
        }
    }

    // No candidate tag produced a usable title; make that explicit instead of
    // falling off the end of the function.
    return null;
}
|
||||||
|
|
||||||
public function collectData()
|
public function collectData()
|
||||||
{
|
{
|
||||||
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
|
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
|
||||||
|
@ -62,30 +78,33 @@ class GQMagazineBridge extends BridgeAbstract
|
||||||
$main = $html->find('main', 0);
|
$main = $html->find('main', 0);
|
||||||
foreach ($main->find('a') as $link) {
|
foreach ($main->find('a') as $link) {
|
||||||
$uri = $link->href;
|
$uri = $link->href;
|
||||||
$title = $link->find('h2', 0);
|
|
||||||
$date = $link->find('time', 0);
|
$date = $link->find('time', 0);
|
||||||
|
|
||||||
$item = array();
|
$item = array();
|
||||||
$author = $link->find('span[itemprop=name]', 0);
|
$author = $link->find('span[itemprop=name]', 0);
|
||||||
$item['author'] = $author->plaintext;
|
if($author !== null) {
|
||||||
$item['title'] = $title->plaintext;
|
$item['author'] = $author->plaintext;
|
||||||
if(substr($uri, 0, 1) === 'h') { // absolute uri
|
$item['title'] = $this->findTitleOf($link);
|
||||||
$item['uri'] = $uri;
|
switch(substr($uri, 0, 1)) {
|
||||||
} else if(substr($uri, 0, 1) === '/') { // domain relative url
|
case 'h': // absolute uri
|
||||||
$item['uri'] = $this->getDomain() . $uri;
|
$item['uri'] = $uri;
|
||||||
} else {
|
break;
|
||||||
$item['uri'] = $this->getDomain() . '/' . $uri;
|
case '/': // domain relative uri
|
||||||
|
$item['uri'] = $this->getDomain() . $uri;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
$item['uri'] = $this->getDomain() . '/' . $uri;
|
||||||
|
}
|
||||||
|
$article = $this->loadFullArticle($item['uri']);
|
||||||
|
if($article) {
|
||||||
|
$item['content'] = $this->replaceUriInHtmlElement($article);
|
||||||
|
} else {
|
||||||
|
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
|
||||||
|
}
|
||||||
|
$short_date = $date->datetime;
|
||||||
|
$item['timestamp'] = strtotime($short_date);
|
||||||
|
$this->items[] = $item;
|
||||||
}
|
}
|
||||||
|
|
||||||
$article = $this->loadFullArticle($item['uri']);
|
|
||||||
if($article) {
|
|
||||||
$item['content'] = $this->replaceUriInHtmlElement($article);
|
|
||||||
} else {
|
|
||||||
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
|
|
||||||
}
|
|
||||||
$short_date = $date->datetime;
|
|
||||||
$item['timestamp'] = strtotime($short_date);
|
|
||||||
$this->items[] = $item;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,16 +115,7 @@ class GQMagazineBridge extends BridgeAbstract
|
||||||
*/
|
*/
|
||||||
private function loadFullArticle($uri){
|
private function loadFullArticle($uri){
|
||||||
$html = getSimpleHTMLDOMCached($uri);
|
$html = getSimpleHTMLDOMCached($uri);
|
||||||
// Once again, that generated css classes madness is an obstacle ... which i can go over easily
|
return $html->find('section[data-test-id=ArticleBodyContent]', 0);
|
||||||
foreach($html->find('div') as $div) {
|
|
||||||
// List the CSS classes of that div
|
|
||||||
$classes = $div->class;
|
|
||||||
// I can't directly lookup that class since GQ since to generate random names like "ArticleBodySection-fkggUW"
|
|
||||||
if(strpos($classes, 'ArticleBodySection') !== false) {
|
|
||||||
return $div;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in a new issue