[KununuBridge] Fix broken bridge and simplify implementation

This commit is contained in:
logmanoriginal 2018-09-16 09:55:35 +02:00
parent 811e8d8c88
commit e6476a600d

View file

@ -64,7 +64,7 @@ class KununuBridge extends BridgeAbstract {
return parent::getURI(); return parent::getURI();
} }
function getName(){ public function getName(){
if(!is_null($this->getInput('company'))) { if(!is_null($this->getInput('company'))) {
$company = $this->fixCompanyName($this->getInput('company')); $company = $this->fixCompanyName($this->getInput('company'));
return ($this->companyName ?: $company) . ' - ' . self::NAME; return ($this->companyName ?: $company) . ' - ' . self::NAME;
@ -73,52 +73,67 @@ class KununuBridge extends BridgeAbstract {
return parent::getName(); return parent::getName();
} }
public function getIcon() {
return 'https://www.kununu.com/favicon-196x196.png';
}
public function collectData(){ public function collectData(){
$full = $this->getInput('full'); $full = $this->getInput('full');
// Load page // Load page
$html = getSimpleHTMLDOMCached($this->getURI()); $html = getSimpleHTMLDOM($this->getURI())
if(!$html) or returnServerError('Unable to receive data from ' . $this->getURI() . '!');
returnServerError('Unable to receive data from ' . $this->getURI() . '!');
$html = defaultLinkTo($html, static::URI);
// Update name for this request // Update name for this request
$this->companyName = $this->extractCompanyName($html); $company = $html->find('span[class="company-name"]', 0)
or returnServerError('Cannot find company name!');
$this->companyName = $company->innertext;
// Find the section with all the panels (reviews) // Find the section with all the panels (reviews)
$section = $html->find('section.kununu-scroll-element', 0); $section = $html->find('section.kununu-scroll-element', 0)
if($section === false) or returnServerError('Unable to find panel section!');
returnServerError('Unable to find panel section!');
// Find all articles (within the panels) // Find all articles (within the panels)
$articles = $section->find('article'); $articles = $section->find('article')
if($articles === false || empty($articles)) or returnServerError('Unable to find articles!');
returnServerError('Unable to find articles!');
// Go through all articles // Go through all articles
foreach($articles as $article) { foreach($articles as $article) {
$anchor = $article->find('h1.review-title a', 0)
or returnServerError('Cannot find article URI!');
$date = $article->find('meta[itemprop=dateCreated]', 0)
or returnServerError('Cannot find article date!');
$rating = $article->find('span.rating', 0)
or returnServerError('Cannot find article rating!');
$summary = $article->find('[itemprop=name]', 0)
or returnServerError('Cannot find article summary!');
$item = array(); $item = array();
$item['author'] = $this->extractArticleAuthorPosition($article); $item['author'] = $this->extractArticleAuthorPosition($article);
$item['timestamp'] = $this->extractArticleDate($article); $item['timestamp'] = strtotime($date);
$item['title'] = $this->extractArticleRating($article) $item['title'] = $rating->getAttribute('aria-label')
. ' : ' . ' : '
. $this->extractArticleSummary($article); . strip_tags($summary->innertext);
$item['uri'] = $this->extractArticleUri($article); $item['uri'] = $anchor->href;
if($full) if($full) {
$item['content'] = $this->extractFullDescription($item['uri']); $item['content'] = $this->extractFullDescription($item['uri']);
else } else {
$item['content'] = $this->extractArticleDescription($article); $item['content'] = $this->extractArticleDescription($article);
}
$this->items[] = $item; $this->items[] = $item;
}
}
/** }
* Fixes relative URLs in the given text
*/
private function fixUrl($text){
return preg_replace('/href=(\'|\")\//i', 'href="'.self::URI, $text);
} }
/* /*
@ -128,73 +143,11 @@ class KununuBridge extends BridgeAbstract {
$company = trim($company); $company = trim($company);
$company = str_replace(' ', '-', $company); $company = str_replace(' ', '-', $company);
$company = strtolower($company); $company = strtolower($company);
return $this->encodeUmlauts($company);
}
/**
* Encodes umlauts in the given text
*/
private function encodeUmlauts($text){
$umlauts = Array('/ä/','/ö/','/ü/','/Ä/','/Ö/','/Ü/','/ß/'); $umlauts = Array('/ä/','/ö/','/ü/','/Ä/','/Ö/','/Ü/','/ß/');
$replace = Array('ae','oe','ue','Ae','Oe','Ue','ss'); $replace = Array('ae','oe','ue','Ae','Oe','Ue','ss');
return preg_replace($umlauts, $replace, $text); return preg_replace($umlauts, $replace, $company);
}
/**
* Returns the company name from the review html
*/
private function extractCompanyName($html){
$company_name = $html->find('h1[itemprop=name]', 0);
if(is_null($company_name))
returnServerError('Cannot find company name!');
return $company_name->plaintext;
}
/**
* Returns the date from a given article
*/
private function extractArticleDate($article){
// They conveniently provide a time attribute for us :)
$date = $article->find('meta[itemprop=dateCreated]', 0);
if(is_null($date))
returnServerError('Cannot find article date!');
return strtotime($date->content);
}
/**
* Returns the rating from a given article
*/
private function extractArticleRating($article){
$rating = $article->find('span.rating', 0);
if(is_null($rating))
returnServerError('Cannot find article rating!');
return $rating->getAttribute('aria-label');
}
/**
* Returns the summary from a given article
*/
private function extractArticleSummary($article){
$summary = $article->find('[itemprop=name]', 0);
if(is_null($summary))
returnServerError('Cannot find article summary!');
return strip_tags($summary->innertext);
}
/**
* Returns the URI from a given article
*/
private function extractArticleUri($article){
$anchor = $article->find('h1.review-title a', 0);
if(is_null($anchor))
returnServerError('Cannot find article URI!');
return self::URI . $anchor->href;
} }
/** /**
@ -202,9 +155,8 @@ class KununuBridge extends BridgeAbstract {
*/ */
private function extractArticleAuthorPosition($article){ private function extractArticleAuthorPosition($article){
// We need to parse the user-content manually // We need to parse the user-content manually
$user_content = $article->find('div.user-content', 0); $user_content = $article->find('div.user-content', 0)
if(is_null($user_content)) or returnServerError('Cannot find user content!');
returnServerError('Cannot find user content!');
// Go through all h2 elements to find index of required span (I know... it's stupid) // Go through all h2 elements to find index of required span (I know... it's stupid)
$author_position = 'Unknown'; $author_position = 'Unknown';
@ -222,11 +174,10 @@ class KununuBridge extends BridgeAbstract {
* Returns the description from a given article * Returns the description from a given article
*/ */
private function extractArticleDescription($article){ private function extractArticleDescription($article){
$description = $article->find('[itemprop=reviewBody]', 0); $description = $article->find('[itemprop=reviewBody]', 0)
if(is_null($description)) or returnServerError('Cannot find article description!');
returnServerError('Cannot find article description!');
return $this->fixUrl($description->innertext); return $description->innertext;
} }
/** /**
@ -234,14 +185,14 @@ class KununuBridge extends BridgeAbstract {
*/ */
private function extractFullDescription($uri){ private function extractFullDescription($uri){
// Load full article // Load full article
$html = getSimpleHTMLDOMCached($uri); $html = getSimpleHTMLDOMCached($uri)
if($html === false) or returnServerError('Could not load full description!');
returnServerError('Could not load full description!');
$html = defaultLinkTo($html, static::URI);
// Find the article // Find the article
$article = $html->find('article', 0); $article = $html->find('article', 0)
if(is_null($article)) or returnServerError('Cannot find article!');
returnServerError('Cannot find article!');
// Luckily they use the same layout for the review overview and full article pages :) // Luckily they use the same layout for the review overview and full article pages :)
return $this->extractArticleDescription($article); return $this->extractArticleDescription($article);