Adding a meta-bridge for all Gawker publications (lifehacker, kotaku, you name it, ...)
This commit is contained in:
parent
1a12f48e2e
commit
fd71ceae82
2 changed files with 123 additions and 13 deletions
89
bridges/Gawker.php
Normal file
89
bridges/Gawker.php
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
<?php
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @name Gawker media
|
||||||
|
* @description A bridge allowing access to any of the numerous Gawker media blogs (Lifehacker, Deadspin, Kotaku, Jezebel, and so on)
|
||||||
|
* @update 27/03/2014
|
||||||
|
* @use1(site="site")
|
||||||
|
*/
|
||||||
|
class Gawker extends HttpCachingBridgeAbstract{
    // Base URL of the Gawker site to scrape; taken from the "site" request parameter.
    private $uri;

    // Feed name; taken from the <title> element of the site's front page.
    private $name;

    /**
     * Loads the front page of the requested site and hands every
     * ".main-column" element to parseContent() to extract its articles.
     *
     * @param array $param Request parameters; 'site' must hold the URL of the blog.
     */
    public function collectData(array $param){
        if (empty($param['site'])) {
            trigger_error("If no site is provided, nothing is gonna happen", E_USER_ERROR);
            // Only reached when a custom error handler swallows the error:
            // without a site there is nothing to load.
            return;
        }
        $this->uri = $param['site'];

        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
        $this->message("loaded HTML from ".$this->getURI());

        // customize name (a page without <title> simply leaves the name unset)
        $title = $html->find('title', 0);
        if (is_object($title)) {
            $this->name = $title->innertext;
        }

        foreach($html->find('.main-column') as $content) {
            $this->parseContent($content);
        }
    }

    /**
     * Walks every link found inside a ".headline" element of the given node
     * and turns the ones pointing at the current site into feed items.
     *
     * @param object $content DOM node to search (an element matched by ".main-column").
     */
    public function parseContent($content) {
        foreach($content->find('.headline') as $headline) {
            foreach($headline->find('a') as $articleLink) {
                // notice we only use article from this gawker site (as gawker like to see us visit other sites)
                // strpos() returns false when the needle is absent, and "false >= 0"
                // is true in PHP, so the result has to be compared strictly.
                if(strpos((string) $articleLink->href, $this->getURI()) !== false) {
                    $this->parseLink($articleLink);
                }
            }
        }
    }

    /**
     * Builds one feed item from a headline link: fetches the article through
     * the page cache, then extracts its body, publication time and author.
     * When the article cannot be parsed, the item is still emitted, with the
     * error message as its content.
     *
     * @param object $infoLink Anchor node whose href is the article URL and
     *                         whose inner text is the article title.
     */
    public function parseLink($infoLink) {
        $item = new Item();
        $item->uri = $infoLink->href;
        $item->title = $infoLink->innertext;
        try {
            // now load that uri from cache
            $articlePage = str_get_html($this->get_cached($item->uri));
            if(!is_object($articlePage)) {
                throw new Exception("cache content for ".$item->uri." is NOT a Simple DOM parser object !");
            }

            // A page without article body is treated like an unparsable page,
            // instead of dereferencing a missing node.
            $content = $articlePage->find('.post-content', 0);
            if(!is_object($content)) {
                throw new Exception("no .post-content element found in ".$item->uri);
            }
            $this->defaultImageSrcTo($content, $this->getURI());
            $item->content = $content->innertext;

            // http://stackoverflow.com/q/22715928/15619
            // data-publishtime is handed to js_to_unix_timestamp(), which treats it
            // as milliseconds; the timestamp is optional, so a missing or
            // non-numeric value only leaves it unset.
            $publishNode = $articlePage->find('.publish-time', 0);
            if(is_object($publishNode)) {
                $publishtime = $publishNode->getAttribute("data-publishtime");
                if(is_numeric($publishtime)) {
                    $item->timestamp = $this->js_to_unix_timestamp($publishtime);
                }
            }

            // The author is optional as well.
            $vcard = $articlePage->find('.vcard', 0);
            if(is_object($vcard)) {
                $author = $vcard->find('a', 0);
                if(is_object($author)) {
                    $item->name = $author->innertext;
                }
            }
        } catch(Exception $e) {
            $this->message("obtaining ".$item->uri." resulted in exception ".$e->getMessage().". Deleting cached page ...");
            // maybe file is incorrect. it should be discarded from cache
            // (the item stores its address in "uri"; there is no "url" property)
            $this->remove_from_cache($item->uri);
            $item->content = $e->getMessage();
        }
        $this->items[] = $item;
    }

    /**
     * Converts a JavaScript timestamp (milliseconds since the Unix epoch)
     * into a Unix timestamp (whole seconds since the epoch).
     *
     * @param int|float|string $jsTimestamp Numeric millisecond timestamp.
     * @return int Seconds since the Unix epoch, rounded down.
     */
    public function js_to_unix_timestamp($jsTimestamp){
        return (int) floor($jsTimestamp / 1000);
    }

    /**
     * @return string|null Title of the scraped front page; null until collectData() has run.
     */
    public function getName(){
        return $this->name;
    }

    /**
     * @return string|null URL of the site given to collectData(); null before that.
     */
    public function getURI(){
        return $this->uri;
    }

    /**
     * @return int Cache lifetime in seconds.
     */
    public function getCacheDuration(){
        return 3600; // 1h
    }

    /**
     * @return string Short human-readable description of the feed.
     */
    public function getDescription(){
        return "Gawker press blog content.";
    }
}
|
|
@ -109,17 +109,14 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
|
||||||
$filename = $filename."index.html";
|
$filename = $filename."index.html";
|
||||||
}
|
}
|
||||||
if(file_exists($filename)) {
|
if(file_exists($filename)) {
|
||||||
// $this->message("loading cached file from ".$filename." for page at url ".$url);
|
// $this->message("loading cached file from ".$filename." for page at url ".$url);
|
||||||
// TODO touch file and its parent, and try to do neighbour deletion
|
// TODO touch file and its parent, and try to do neighbour deletion
|
||||||
$currentPath = $filename;
|
$this->refresh_in_cache($pageCacheDir, $filename);
|
||||||
while(!$pageCacheDir==$currentPath) {
|
|
||||||
touch($currentPath);
|
|
||||||
$currentPath = dirname($currentPath);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// $this->message("we have no local copy of ".$url." Downloading !");
|
// $this->message("we have no local copy of ".$url." Downloading to ".$filename);
|
||||||
$dir = substr($filename, 0, strrpos($filename, '/'));
|
$dir = substr($filename, 0, strrpos($filename, '/'));
|
||||||
if(!is_dir($dir)) {
|
if(!is_dir($dir)) {
|
||||||
|
// $this->message("creating directories for ".$dir);
|
||||||
mkdir($dir, 0777, true);
|
mkdir($dir, 0777, true);
|
||||||
}
|
}
|
||||||
$this->download_remote($url, $filename);
|
$this->download_remote($url, $filename);
|
||||||
|
@ -127,16 +124,40 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
|
||||||
return file_get_contents($filename);
|
return file_get_contents($filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Marks a cached page as recently used by touching the file and every
 * directory between it and the cache root (the root itself is not touched).
 *
 * @param string $pageCacheDir Root directory of the page cache; a trailing slash is accepted.
 * @param string $filename     Path of the cached file, located below $pageCacheDir.
 */
private function refresh_in_cache($pageCacheDir, $filename) {
    // dirname() never yields a trailing separator, so strip it from the root
    // to make the comparison below able to succeed.
    $cacheRoot = rtrim($pageCacheDir, '/\\');
    if ($cacheRoot === '') {
        // No usable root to stop at: refresh the file only.
        touch($filename);
        return;
    }

    $currentPath = $filename;
    // "!$a == $b" parses as "(!$a) == $b" and is false for any non-empty path,
    // so the comparison has to be written with an explicit inequality.
    // The prefix check keeps the walk from leaving the cache directory.
    while ($currentPath !== $cacheRoot && strpos($currentPath, $cacheRoot) === 0) {
        touch($currentPath);
        $parentPath = dirname($currentPath);
        if ($parentPath === $currentPath) {
            // Reached the top of the path without meeting the root.
            break;
        }
        $currentPath = $parentPath;
    }
}
|
||||||
|
|
||||||
/**
 * Downloads the resource at $url into the local file $save_path.
 * Failures are silent: when the remote cannot be opened the local file is
 * left untouched, and a partially written file is removed again.
 *
 * @param string $url       Address to read from (anything fopen() can open).
 * @param string $save_path Local path to write to; its directory must already exist.
 */
public function download_remote($url , $save_path) {
    // Open the remote side first: opening the destination with "w+" truncates
    // it, which would leave an empty file behind whenever the request fails.
    $handle = fopen($url , "rb");
    if(!$handle) {
        return;
    }

    $f = fopen( $save_path , 'w+');
    if($f) {
        // Copies the whole stream; unlike a truthiness test on each chunk,
        // this does not drop a chunk consisting of the single character "0".
        $copied = stream_copy_to_stream($handle, $f);
        fclose($f);
        if($copied === false) {
            // Do not keep an incomplete download.
            unlink($save_path);
        }
    }
    fclose($handle);
}
|
||||||
|
|
||||||
|
/**
 * Resolves the cache file belonging to $url and logs it.
 * Deletion itself is currently disabled, so nothing is removed from disk.
 *
 * @param string $url Address of the page whose cached copy should be dropped.
 */
public function remove_from_cache($url) {
    // URL fragments and what they become in the cache path, applied in this order.
    $replacements = [
        "http://"  => "",
        "https://" => "",
        "?"        => "/",
        "&"        => "/",
        "="        => "/",
    ];
    $relativePath = str_replace(array_keys($replacements), array_values($replacements), $url);

    // TODO build this from the variable given to Cache
    $cacheRoot = __DIR__ . '/../cache/'."pages/";

    // realpath() yields false for a path that does not exist, which prints as an empty string.
    $resolvedPath = realpath($cacheRoot.$relativePath);
    $this->message("removing from cache \"".$resolvedPath."\" WELL, NOT REALLY");

    // The resolved filename is not reliable yet, so the actual deletion stays disabled:
    // unlink($resolvedPath);
}
|
||||||
|
|
||||||
public function message($text) {
|
public function message($text) {
|
||||||
$backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3);
|
$backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3);
|
||||||
|
|
Loading…
Reference in a new issue