Rss-Bridge/bridges/GooglePlusPostBridge.php
logmanoriginal 3276d4e3d5 [GooglePlusPostBridge] Fix content loading
- Do not force language via HTTP header
The header enforced the language to be french which caused problems parsing
the exact time due to spellings (strtotime cannot work with 'semaines'). If
further issues are experienced try forcing en-us instead.
=> This should really be done in the RSS-Bridge core

- Fix loading problems due to pinned articles
Pinned articles do not provide a timestamp. Building the timestamp step-by-step
solves parsing errors.

- Use class names instead of CSS paths
CSS paths change based on the article. Pinned articles provide a different
DOM structure which caused parsing errors.

Reported via #499
2017-03-26 16:41:20 +02:00

104 lines
3.2 KiB
PHP

<?php
class GooglePlusPostBridge extends BridgeAbstract{
protected $_title;
protected $_url;
const MAINTAINER = 'Grummfy';
const NAME = 'Google Plus Post Bridge';
const URI = 'https://plus.google.com/';
const CACHE_TIMEOUT = 600; //10min
const DESCRIPTION = 'Returns user public post (without API).';
const PARAMETERS = array( array(
'username' => array(
'name' => 'username or Id',
'required' => true
)
));
public function collectData(){
// get content parsed
$html = getSimpleHTMLDOMCached(self::URI . urlencode($this->getInput('username')) . '/posts')
or returnServerError('No results for this query.');
// get title, url, ... there is a lot of intresting stuff in meta
$this->_title = $html->find('meta[property=og:title]', 0)->getAttribute('content');
$this->_url = $html->find('meta[property=og:url]', 0)->getAttribute('content');
// I don't even know where to start with this discusting html...
foreach($html->find('div[jsname=WsjYwc]') as $post){
$item = array();
$item['author'] = $item['fullname'] = $post->find('div div div div a', 0)->innertext;
$item['id'] = $post->find('div div div', 0)->getAttribute('id');
$item['avatar'] = $post->find('div img', 0)->src;
$item['uri'] = self::URI . $post->find('div div div a', 1)->href;
$timestamp = $post->find('a.qXj2He span', 0);
if($timestamp){
$item['timestamp'] = strtotime('+' . preg_replace(
'/[^0-9A-Za-z]/',
'',
$timestamp->getAttribute('aria-label')));
}
// hashtag to treat : https://plus.google.com/explore/tag
// $hashtags = array();
// foreach($post->find('a.d-s') as $hashtag){
// $hashtags[trim($hashtag->plaintext)] = self::URI . $hashtag->href;
// }
$item['content'] = '';
// avatar display
$item['content'] .= '<div style="float:left; margin: 0 0.5em 0.5em 0;"><a href="'
. self::URI
. urlencode($this->getInput('username'));
$item['content'] .= '"><img align="top" alt="'
. $item['author']
. '" src="'
. $item['avatar']
. '" /></a></div>';
$content = $post->find('div[jsname=EjRJtf]', 0);
// extract plaintext
$item['content_simple'] = $content->plaintext;
$item['title'] = substr($item['content_simple'], 0, 72) . '...';
// XXX ugly but I don't have any idea how to do a better stuff,
// str_replace on link doesn't work as expected and ask too many checks
foreach($content->find('a') as $link){
$hasHttp = strpos($link->href, 'http');
$hasDoubleSlash = strpos($link->href, '//');
if((!$hasHttp && !$hasDoubleSlash)
|| (false !== $hasHttp && strpos($link->href, 'http') != 0)
|| (false === $hasHttp && false !== $hasDoubleSlash && $hasDoubleSlash != 0)){
// skipp bad link, for some hashtag or other stuff
if(strpos($link->href, '/') == 0){
$link->href = substr($link->href, 1);
}
$link->href = self::URI . $link->href;
}
}
$content = $content->innertext;
$item['content'] .= '<div style="margin-top: -1.5em">' . $content . '</div>';
$item['content'] = trim(strip_tags($item['content'], '<a><p><div><img>'));
$this->items[] = $item;
}
}
public function getName(){
return $this->_title ?: 'Google Plus Post Bridge';
}
public function getURI(){
return $this->_url ?: parent::getURI();
}
}