Rss-Bridge/bridges/WikipediaBridge.php
Pierre Mazière 6e2c7ceaf4 bridges: rename file_get_html to getSimpleHTMLDOM
Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
2016-08-19 10:20:27 +02:00

304 lines
8.7 KiB
PHP

<?php
// Numeric identifiers for the feed subjects this bridge can deliver.
const WIKIPEDIA_SUBJECT_TFA = 0; // Today's featured article
const WIKIPEDIA_SUBJECT_DYK = 1; // Did you know...
class WikipediaBridge extends BridgeAbstract{
/**
 * Declares the descriptive metadata of this bridge and its parameter form.
 *
 * The form is appended to $this->parameters as a JSON string describing
 * three inputs: the language list, the subject list and the
 * "load full article" checkbox. CheckLanguageCode() decodes this same
 * string and reads the "values" of its first entry, so the language list
 * has to stay the first parameter.
 */
public function loadMetadatas(){
    // A nowdoc keeps the JSON free of PHP quote escaping.
    $parameterForm = <<<'JSON'
[
{
"name": "Language",
"identifier": "language",
"type": "list",
"required": true,
"title": "Select your language",
"exampleValue": "English",
"values": [
{
"name": "English",
"value": "en"
},
{
"name": "German",
"value": "de"
},
{
"name": "French",
"value": "fr"
},
{
"name": "Esperanto",
"value": "eo"
}
]
},
{
"name": "Subject",
"identifier": "subject",
"type": "list",
"required": true,
"title": "What subject are you interested in?",
"exampleValue": "Today's featured article",
"values": [
{
"name": "Today's featured article",
"value": "tfa"
},
{
"name": "Did you know...",
"value": "dyk"
}
]
},
{
"name": "Load full article",
"identifier": "fullarticle",
"type": "checkbox",
"required": false,
"title": "Activate to always load the full article",
"exampleValue": "false"
}
]
JSON;

    $this->name = 'Wikipedia bridge for many languages';
    $this->description = 'Returns articles for a language of your choice';
    $this->uri = 'https://www.wikipedia.org/';
    $this->maintainer = 'logmanoriginal';
    $this->update = '2016-08-17';

    $this->parameters[] = $parameterForm;
}
/**
 * Validates the request, loads the main page of the requested Wikipedia
 * and hands it to the language specific GetContents<CODE>() method, which
 * creates the feed items.
 *
 * Side effects: $this->uri is replaced by the base URI of the selected
 * wiki and $this->name by a feed title matching the selected subject.
 *
 * @param array $params Request parameters. Reads:
 *   - 'language'    (required) one of the language codes declared in
 *                   loadMetadatas(); compared case-insensitively
 *   - 'subject'     (required) 'dyk' selects "Did you know...", any other
 *                   value selects "Today's featured article"
 *   - 'fullarticle' (optional) enabled only if it equals the string 'on'
 */
public function collectData(array $params){
    // A repeated query key (language[]=en) is not a string and must not reach strtolower()
    if(!isset($params['language']) || !is_string($params['language']))
        $this->returnClientError('You must specify a valid language via \'&language=\'!');

    // Normalise once; every later use needs the lower case code
    $language = strtolower($params['language']);

    if(!$this->CheckLanguageCode($language))
        $this->returnClientError('The language code you provided (\'' . $params['language'] . '\') is not supported!');

    if(!isset($params['subject']))
        $this->returnClientError('You must specify a valid subject via \'&subject=\'!');

    // Unknown subjects fall back to today's featured article
    $subject = ($params['subject'] === 'dyk') ? WIKIPEDIA_SUBJECT_DYK : WIKIPEDIA_SUBJECT_TFA;

    $fullArticle = isset($params['fullarticle']) && $params['fullarticle'] === 'on';

    // We store the correct URI as URI of this bridge (so it can be used later!)
    $host = $language . '.wikipedia.org';
    $this->uri = 'https://' . $host;

    // While we are at it, let's also update the name of the feed
    if($subject === WIKIPEDIA_SUBJECT_DYK)
        $this->name = 'Did you know? - articles from ' . $host;
    else
        $this->name = 'Today\'s featured article from ' . $host;

    // This will automatically send us to the correct main page in any language (try it!)
    $html = $this->getSimpleHTMLDOM($this->uri . '/wiki');
    if(!$html)
        $this->returnServerError('Could not load site: ' . $this->uri . '!');

    /*
     * Now read content depending on the language (make sure to create one function per language!)
     * We build the function name automatically, just make sure you create a private function ending
     * with your desired language code, where the language code is upper case! (en -> GetContentsEN).
     */
    $function = 'GetContents' . strtoupper($language);
    if(!method_exists($this, $function))
        $this->returnServerError('A function to get the contents for your language is missing (\'' . $function . '\')!');

    // The method takes care of creating all items.
    $this->$function($html, $subject, $fullArticle);
}
/**
 * Tells whether a language code is offered by this bridge.
 *
 * The supported codes are the "value" entries of the first parameter
 * (the language list) declared as JSON in $this->parameters[0].
 *
 * @param string $languageCode Language code to look up; compared
 *                             case-sensitively, so pass it in lower case
 * @return bool true if the code is part of the parameters list, false
 *              otherwise (also for non-string input or if the parameter
 *              definition cannot be decoded)
 */
private function CheckLanguageCode($languageCode){
    // Only strings can be language codes; this also keeps the comparison below strict
    if(!is_string($languageCode))
        return false;

    $parameter = json_decode($this->parameters[0], true);
    if(!is_array($parameter) || !isset($parameter[0]['values']) || !is_array($parameter[0]['values']))
        return false;

    foreach($parameter[0]['values'] as $language){
        if(isset($language['value']) && $language['value'] === $languageCode)
            return true;
    }

    return false;
}
/**
 * Replaces all relative URIs in href attributes with absolute ones
 *
 * Root-relative links (href="/wiki/...") are prefixed with $this->uri.
 * Protocol-relative links (href="//host/...") already name their host and
 * only receive the https scheme; prefixing them with $this->uri as well
 * would produce a broken 'https://xx.wikipedia.org//host/...' link.
 *
 * @param $element A simplehtmldom element
 * @return The $element->innertext with all URIs replaced
 */
private function ReplaceURIInHTMLElement($element){
    // strtr() tries the longest key first and never rescans replaced text,
    // so 'href="//' wins over 'href="/' where both match.
    return strtr($element->innertext, array(
        'href="//' => 'href="https://',
        'href="/' => 'href="' . $this->uri . '/'
    ));
}
/*
 * Adds a new item to $items using a generic operation (should work for most (all?) wikis)
 *
 * $element is the main-page container of the featured article as returned
 * by find(); it is null if the expected container is missing from the page.
 * $fullArticle selects between the teaser found on the main page (false)
 * and the content of the linked article (true).
 */
private function AddTodaysFeaturedArticleGeneric($element, $fullArticle){
    // The per-language selectors break whenever a main page is redesigned
    if(!$element)
        $this->returnServerError('Could not find today\'s featured article on: ' . $this->uri . '!');

    // Clean the bottom of the featured article (if there is a trailing div at all)
    $footer = $element->find('div', -1);
    if($footer)
        $footer->outertext = '';

    // The title and URI of the article is best defined in an anchor containing the string '...' ('full article ...')
    $target = $element->find('p/a', 0); // We'll use the first anchor as fallback
    foreach($element->find('//a') as $anchor){
        if(strpos($anchor->innertext, '...') !== false){
            $target = $anchor;
            break;
        }
    }

    // Without any anchor there is neither a title nor a URI for the item
    if(!$target)
        $this->returnServerError('Could not find a link to today\'s featured article on: ' . $this->uri . '!');

    $item = new \Item();
    $item->uri = $this->uri . $target->href;
    $item->title = $target->title;

    if(!$fullArticle)
        $item->content = strip_tags($this->ReplaceURIInHTMLElement($element), '<a><p><br><img>');
    else
        $item->content = $this->LoadFullArticle($item->uri);

    $this->items[] = $item;
}
/*
 * Adds one item per 'Did you know...' entry to $items using a generic operation
 * (should work for most (all?) wikis)
 *
 * $element is the main-page container holding the list of entries as
 * returned by find(); it is null if the expected container is missing.
 * Only the first <ul> inside the container is read, one item per <li>.
 * $fullArticle selects between the entry text (false) and the content of
 * the article linked by the entry (true).
 */
private function AddDidYouKnowGeneric($element, $fullArticle){
    $list = $element ? $element->find('ul', 0) : null;
    if(!$list)
        $this->returnServerError('Could not find the \'Did you know...\' list on: ' . $this->uri . '!');

    foreach($list->find('li') as $entry){
        // We can only use the first anchor, there is no way of finding the 'correct' one if there are multiple
        $anchor = $entry->find('a', 0);

        // An entry without any anchor does not point to an article, skip it
        if(!$anchor)
            continue;

        $item = new \Item();
        $item->uri = $this->uri . $anchor->href;
        $item->title = strip_tags($entry->innertext);

        if(!$fullArticle)
            $item->content = $this->ReplaceURIInHTMLElement($entry);
        else
            $item->content = $this->LoadFullArticle($item->uri);

        $this->items[] = $item;
    }
}
/**
 * Loads the full article from a given URI
 *
 * Reads the element with id 'mw-content-text', blanks the table of
 * contents ('#toc') and all reference lists ('ol.references') and makes
 * the links absolute. Reports a server error if the page cannot be loaded
 * or has no content element.
 *
 * @param $uri Absolute URI of the article
 * @return The inner HTML of the cleaned content element
 */
private function LoadFullArticle($uri){
    $content_html = $this->getSimpleHTMLDOM($uri);
    if(!$content_html)
        $this->returnServerError('Could not load site: ' . $uri . '!');

    $content = $content_html->find('#mw-content-text', 0);
    if(!$content)
        $this->returnServerError('Could not find content in page: ' . $uri . '!');

    // Let's remove a couple of things from the article
    $table = $content->find('#toc', 0); // Table of contents (not present on every article)
    if($table)
        $table->outertext = '';

    foreach($content->find('ol.references') as $reference) // References
        $reference->outertext = '';

    // Root-relative links get the wiki's base URI. Protocol-relative links
    // (href="//host/...") already name their host and only need a scheme;
    // strtr() tries the longest key first, so they are not prefixed twice.
    return strtr($content->innertext, array(
        'href="//' => 'href="https://',
        'href="/' => 'href="' . $this->uri . '/'
    ));
}
/**
 * Implementation for de.wikipedia.org
 *
 * Looks up the main-page container of the requested subject and passes it
 * to the matching generic item builder. Other subjects are ignored.
 */
private function GetContentsDE($html, $subject, $fullArticle){
    if($subject == WIKIPEDIA_SUBJECT_TFA){
        $featured = $html->find('div[id=mf-tfa]', 0);
        $this->AddTodaysFeaturedArticleGeneric($featured, $fullArticle);
        return;
    }

    if($subject == WIKIPEDIA_SUBJECT_DYK){
        $didYouKnow = $html->find('div[id=mf-dyk]', 0);
        $this->AddDidYouKnowGeneric($didYouKnow, $fullArticle);
    }
}
/**
 * Implementation for fr.wikipedia.org
 *
 * Looks up the main-page container of the requested subject and passes it
 * to the matching generic item builder. Other subjects are ignored.
 */
private function GetContentsFR($html, $subject, $fullArticle){
    if($subject == WIKIPEDIA_SUBJECT_TFA){
        $featured = $html->find('div[id=accueil-lumieresur]', 0);
        $this->AddTodaysFeaturedArticleGeneric($featured, $fullArticle);
        return;
    }

    if($subject == WIKIPEDIA_SUBJECT_DYK){
        $didYouKnow = $html->find('div[id=SaviezVous]', 0);
        $this->AddDidYouKnowGeneric($didYouKnow, $fullArticle);
    }
}
/**
 * Implementation for en.wikipedia.org
 *
 * Looks up the main-page container of the requested subject and passes it
 * to the matching generic item builder. Other subjects are ignored.
 */
private function GetContentsEN($html, $subject, $fullArticle){
    if($subject == WIKIPEDIA_SUBJECT_TFA){
        $featured = $html->find('div[id=mp-tfa]', 0);
        $this->AddTodaysFeaturedArticleGeneric($featured, $fullArticle);
        return;
    }

    if($subject == WIKIPEDIA_SUBJECT_DYK){
        $didYouKnow = $html->find('div[id=mp-dyk]', 0);
        $this->AddDidYouKnowGeneric($didYouKnow, $fullArticle);
    }
}
/**
 * Implementation for eo.wikipedia.org
 *
 * The featured article has its own container. The 'Did you know...' list
 * has no id and is addressed by position: the fifth cell (index 4) of the
 * fifth table (index 4) inside the content element. Every step of that
 * lookup is checked, because find() returns null as soon as the page
 * layout no longer matches.
 */
private function GetContentsEO($html, $subject, $fullArticle){
    switch($subject){
        case WIKIPEDIA_SUBJECT_TFA:
            $element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
            $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
            break;
        case WIKIPEDIA_SUBJECT_DYK:
            // Resolve the positional path step by step instead of chaining calls on a possible null
            $element = $html->find('div[id=mw-content-text]', 0);
            if($element)
                $element = $element->find('table', 4);
            if($element)
                $element = $element->find('td', 4);
            if(!$element)
                $this->returnServerError('Could not find the \'Did you know...\' section on: ' . $this->uri . '!');
            $this->AddDidYouKnowGeneric($element, $fullArticle);
            break;
        default:
            break;
    }
}
}