From 2797336bbeb20afb97e3222de923e839fcba9cb7 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Wed, 7 Aug 2013 22:33:21 +0200 Subject: [PATCH] Refactoring - object model - code is smaller, more readable. - cache implemented - README updated. - Bridges added: Twitter, Google Search. --- .gitignore | 1 + README.md | 37 ++++++ rss-bridge-flickr-explore.php | 83 ++++--------- rss-bridge-googlesearch.php | 41 +++++++ rss-bridge-lib.php | 214 ++++++++++++++++++++++++++++++++++ rss-bridge-twitter.php | 40 +++++++ 6 files changed, 356 insertions(+), 60 deletions(-) create mode 100644 rss-bridge-googlesearch.php create mode 100644 rss-bridge-lib.php create mode 100644 rss-bridge-twitter.php diff --git a/.gitignore b/.gitignore index 172d97a9..e14b1fbd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ ## Eclipse ################# simple_html_dom.php +data/ *.pydevproject .project .metadata diff --git a/README.md b/README.md index 5450c313..5ba12f66 100644 --- a/README.md +++ b/README.md @@ -7,17 +7,52 @@ Supported sites/pages === * `rss-bridge-flickr-explore.php` : [Latest interesting images](http://www.flickr.com/explore) from Flickr. + * `rss-bridge-googlesearch.php` : Most recent results from Google Search. Parameters: + * q=keyword : Keyword search. + * `rss-bridge-twitter.php` : Twitter. Parameters: + * q=keyword : Keyword search. + * u=username : Get user timeline. +Output format +=== +Output format can be used in any rss-bridge: + + * `format=atom` (default): ATOM Feed. + * `format=json` : jSon + * `format=html` : html page + * `format=plaintext` : raw text (php object, as returned by print_r) + +If format is not specified, ATOM format will be used. + +Examples +=== + * `rss-bridge-twitter.php?u=Dinnerbone` : Get Dinnerbone (Minecraft developer) timeline, in ATOM format. + * `rss-bridge-twitter.php?q=minecraft&format=html` : Everything Minecraft from Twitter, in html format. 
+ * `rss-bridge-flickr-explore.php` : Latest interesting images from Flickr, in ATOM format. + + Requirements === * php 5.3 * [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net/) +Author +=== +I'm sebsauvage, webmaster of [sebsauvage.net](http://sebsauvage.net), author of [Shaarli](http://sebsauvage.net/wiki/doku.php?id=php:shaarli) and [ZeroBin](http://sebsauvage.net/wiki/doku.php?id=php:zerobin). + +Thanks to [Mitsukarenai](https://github.com/Mitsukarenai) for the inspiration. + Licence === Code is public domain. + +Technical notes +=== + * There is a cache so that source services won't ban you even if you hammer the rss-bridge with requests. Each bridge has a different duration for the cache. The `cache` subdirectory will be automatically created. You can purge it whenever you want. + * To implement a new rss-bridge, import `rss-bridge-lib.php` and subclass `RssBridgeAbstractClass`. Look at existing bridges for examples. For items you generate in `$this->items`, only `uri` and `title` are mandatory in each item. `timestamp` and `content` are optional but recommended. Any additional key will be ignored by ATOM feed (but output to JSON). + Rant === @@ -29,4 +64,6 @@ You're not social when you hamper sharing by removing RSS. You're happy to have We want to share with friends, using open protocols: RSS, XMPP, whatever. Because no one wants to have *your* service with *your* applications using *your* API forced-feeded to them. Friends must be free to choose whatever software and service they want. +We are rebuilding bridges you have wilfully destroyed. + Get your shit together: Put RSS back in. 
\ No newline at end of file diff --git a/rss-bridge-flickr-explore.php b/rss-bridge-flickr-explore.php index 09f0fef0..e7f153eb 100644 --- a/rss-bridge-flickr-explore.php +++ b/rss-bridge-flickr-explore.php @@ -1,66 +1,29 @@ find('span.photo_container') as $element) +/** + * RssBridgeFlickrExplore + * Returns the newest interesting images from http://www.flickr.com/explore + */ +class RssBridgeFlickrExplore extends RssBridgeAbstractClass { - $item['href'] = 'http://flickr.com'.$element->find('a',0)->href; // Page URI - $item['thumbnailUri'] = $element->find('img',0)->getAttribute('data-defer-src'); // Thumbnail URI - $item['title'] = $element->find('a',0)->title; // Photo title - $items[] = $item; -} - -if(empty($items)) { returnError('404 Not Found', 'ERROR: no results.'); } -$format = 'atom'; -if (!empty($_GET['format'])) { $format = $_GET['format']; } -switch($format) -{ - case 'plaintext': - case 'json': - case 'atom': - break; - default: - $format='atom'; -} - -if($format == 'plaintext') { header('content-type: text/plain;charset=utf8'); print_r($items); exit; } -if($format == 'json') { header('content-type: application/json'); $items = json_encode($items); exit($items); } -if($format == 'atom') -{ - header('content-type: application/atom+xml; charset=UTF-8'); - echo ''."\n"; - echo 'Flickr Explore'."\n"; - echo 'http'.(isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 's' : '')."://{$_SERVER['HTTP_HOST']}{$_SERVER['PATH_INFO']}".'/'."\n"; - echo ''.date(DATE_ATOM, $tweets['0']['timestamp']).''."\n"; - echo ''."\n"; - echo ''."\n"."\n"; - - foreach($items as $item) { - echo 'Flickrhttp://flickr.com/'."\n"; - echo '<![CDATA['.$item['title'].']]>'."\n"; - echo ''."\n"; - echo ''.$item['href'].''."\n"; - echo ''."\n"; // FIXME: date ??? 
- echo ']]>'."\n"; - echo ''."\n\n"; + protected $bridgeName = 'Flickr Explore'; + protected $bridgeURI = 'http://www.flickr.com/explore'; + protected $bridgeDescription = 'Returns the latest interesting images from Flickr'; + protected $cacheDuration = 360; // 6 hours. No need to get more. + protected function collectData($request) { + $html = file_get_html('http://www.flickr.com/explore') or $this->returnError('404 Not Found', 'ERROR: could not request Flickr.'); + $this->items = Array(); + foreach($html->find('span.photo_container') as $element) { + $item['uri'] = 'http://flickr.com'.$element->find('a',0)->href; + $item['thumbnailUri'] = $element->find('img',0)->getAttribute('data-defer-src'); + $item['content'] = ''; // FIXME: Filter javascript ? + $item['title'] = $element->find('a',0)->title; + $this->items[] = $item; } - echo ''; - exit; + } } -exit(); +$bridge = new RssBridgeFlickrExplore(); +$bridge->process(); +?> \ No newline at end of file diff --git a/rss-bridge-googlesearch.php b/rss-bridge-googlesearch.php new file mode 100644 index 00000000..c19a6ca3 --- /dev/null +++ b/rss-bridge-googlesearch.php @@ -0,0 +1,41 @@ +returnError('404 Not Found', 'ERROR: no results for this query.'); + } else { + $this->returnError('400 Bad Request', 'ERROR: You must specify a keyword (?q=...).'); + } + $this->items = Array(); + foreach($html->find('div[id=ires]',0)->find('li[class=g]') as $element) { + $item['uri'] = $element->find('a[href]',0)->href; + $item['title'] = $element->find('h3',0)->plaintext; + $item['content'] = $element->find('span[class=st]',0)->plaintext; + $this->items[] = $item; + } + } +} + +$bridge = new RssBridgeGoogleSearch(); +$bridge->process(); +?> \ No newline at end of file diff --git a/rss-bridge-lib.php b/rss-bridge-lib.php new file mode 100644 index 00000000..b4ff7f40 --- /dev/null +++ b/rss-bridge-lib.php @@ -0,0 +1,214 @@ +'http://foo.bar', 'title'=>'My beautiful foobar', 'content'='Hello, world !','timestamp'=>'1375864834'), + * 
Array('uri'=>'http://toto.com', 'title'=>'Welcome to toto', 'content'='What is this website about ?','timestamp'=>'1375868313') + * ) + * Keys in dictionnaries: + * uri (string;mandatory) = The URI the item points to. + * title (string;mandatory) = Title of item + * content (string;optionnal) = item content (usually HTML code) + * timestamp (string;optionnal) = item date. Must be in EPOCH format. + * Other keys can be added, but will be ignored. + * $items will be used to build the ATOM feed, json and other outputs. + */ + var $items; + + private $contentType; // MIME type returned to browser. + + /** + * Sets the content-type returns to browser. + * Example: $this->setContentType('text/html; charset=UTF-8') + */ + private function setContentType($value) + { + $this->contentType = $value; + header('Content-Type: '.$value); + } + + /** + * collectData() will be called to ask the bridge to go collect data on the net. + * All derived classes must implement this method. + * This method must fill $this->items with collected items. + * Input: $request : The incoming request (=$_GET). This can be used or ignored by the bridge. + */ + abstract protected function collectData($request); + + /** + * Returns a HTTP error to user, with a message. + * Example: $this->returnError('404 Not Found', 'ERROR: no results.'); + */ + protected function returnError($code, $message) + { + header("HTTP/1.1 $code"); header('Content-Type: text/plain;charset=UTF-8'); + die($message); + } + + /** + * Builds an ATOM feed from $this->items and return it to browser. + */ + private function returnATOM() + { + $this->setContentType('application/atom+xml; charset=UTF-8'); + echo ''."\n"; + echo ''.htmlspecialchars($this->bridgeName).''."\n"; + echo 'http'.(isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 
's' : '')."://{$_SERVER['HTTP_HOST']}{$_SERVER['PATH_INFO']}".'/'."\n"; + echo ''."\n"; // FIXME + echo ''."\n"; + echo ''."\n"."\n"; + + foreach($this->items as $item) { + echo ''.htmlspecialchars($this->bridgeName).''.htmlspecialchars($this->bridgeURI).''."\n"; + echo '<![CDATA['.$item['title'].']]>'."\n"; + echo ''."\n"; + echo ''.$item['uri'].''."\n"; + if (isset($item['timestamp'])) + { + echo ''.date(DATE_ATOM, $item['timestamp']).''."\n"; + + } + else + { + echo ''."\n"; + } + if (isset($item['content'])) + { + echo ''."\n"; + } + else + { + echo ''."\n"; + } + // FIXME: Security: Disable Javascript ? + echo ''."\n\n"; + } + echo ''; + } + + private function returnHTML() + { + $this->setContentType('text/html; charset=UTF-8'); + echo ''.htmlspecialchars($this->bridgeName).''; + echo ''; + echo '

'.htmlspecialchars($this->bridgeName).'

'; + foreach($this->items as $item) { + echo '

'.htmlspecialchars(strip_tags($item['title'])).'

'; + if (isset($item['timestamp'])) { echo ''.date(DATE_ATOM, $item['timestamp']).''; } + if (isset($item['content'])) { echo '

'.$item['content'].'

'; } + + echo "
\n\n"; + } + echo ''; + } + + /** + * Builds a JSON string from $this->items and return it to browser. + */ + private function returnJSON() + { + $this->setContentType('application/json'); + echo json_encode($this->items); + } + + /** + * Returns $this->items as raw php data. + */ + private function returnPlaintext() + { + $this->setContentType('text/plain;charset=UTF-8'); + print_r($this->items); + } + + /** + * Start processing request and return response to browser. + */ + public function process() + { + $this->serveCachedVersion(); + + // Cache file does not exists or has expired: We re-fetch the results and cache it. + $this->collectData($_GET); + if (empty($this->items)) { $this->returnError('404 Not Found', 'ERROR: no results.'); } + + $format = 'atom'; + if (!empty($_GET['format'])) { $format = $_GET['format']; } + switch($format) { + case 'plaintext': + $this->returnPlaintext(); + break; + case 'json': + $this->returnJSON(); + break; + case 'html': + $this->returnHTML(); + break; + default: + $this->returnATOM(); + } + + $this->storeReponseInCache(); + } + + /** + * Returns the cached version of current request URI directly to the browser + * if it exists and if cache has not expired. + * Continues execution no cached version available. + */ + private function serveCachedVersion() + { + // See if cache exists for this request + $cachefile = CACHEDIR.hash('sha1',$_SERVER['REQUEST_URI']).'.cache'; // Cache path and filename + if (file_exists($cachefile)) { // The cache file exists. + if (time() - ($this->cacheDuration*60) < filemtime($cachefile)) { // Cache file has not expired. Serve it. + $data = json_decode(file_get_contents($cachefile),true); + header('Content-Type: '.$data['Content-Type']); // Send proper MIME Type + header('X-Cached-Version: '.date(DATE_ATOM, filemtime($cachefile))); + echo $data['data']; + exit(); + } + } + } + + /** + * Stores currently generated page in cache. 
+ */ + private function storeReponseInCache() + { + $cachefile = CACHEDIR.hash('sha1',$_SERVER['REQUEST_URI']).'.cache'; // Cache path and filename + $data = Array('data'=>ob_get_contents(), 'Content-Type'=>$this->contentType); + file_put_contents($cachefile,json_encode($data)); + ob_end_flush(); + } +} + +?> \ No newline at end of file diff --git a/rss-bridge-twitter.php b/rss-bridge-twitter.php new file mode 100644 index 00000000..b08ac0d9 --- /dev/null +++ b/rss-bridge-twitter.php @@ -0,0 +1,40 @@ +returnError('404 Not Found', 'ERROR: no results for this query.'); + } elseif (isset($request['u'])) { /* user timeline mode */ + $html = file_get_html('http://twitter.com/'.urlencode($request['u'])) or $this->returnError('404 Not Found', 'ERROR: requested username can\'t be found.'); + } else { + $this->returnError('400 Bad Request', 'ERROR: You must specify a keyword (?q=...) or a Twitter username (?u=...).'); + } + $this->items = Array(); + foreach($html->find('div.tweet') as $tweet) { + $item['username'] = trim(substr($tweet->find('span.username', 0)->plaintext, 1)); // extract username and sanitize + $item['fullname'] = $tweet->getAttribute('data-name'); // extract fullname (pseudonym) + $item['avatar'] = $tweet->find('img', 0)->src; // get avatar link + $item['id'] = $tweet->getAttribute('data-tweet-id'); // get TweetID + $item['uri'] = 'https://twitter.com'.$tweet->find('a.details', 0)->getAttribute('href'); // get tweet link + $item['timestamp'] = $tweet->find('span._timestamp', 0)->getAttribute('data-time'); // extract tweet timestamp + $item['content'] = str_replace('href="/', 'href="https://twitter.com/', strip_tags($tweet->find('p.tweet-text', 0)->innertext, '')); // extract tweet text + $item['title'] = $item['fullname'] . ' (@'.$item['username'] . ') | ' . $item['content']; + $this->items[] = $item; + } + } +} + +$bridge = new RssBridgeTwitter(); +$bridge->process(); +?> \ No newline at end of file