From 2797336bbeb20afb97e3222de923e839fcba9cb7 Mon Sep 17 00:00:00 2001
From: Sebastien SAUVAGE <sebsauvage@sebsauvage.net>
Date: Wed, 7 Aug 2013 22:33:21 +0200
Subject: [PATCH] Refactoring

- object model
- code is smaller, more readable.
- cache implemented
- README updated.
- Bridges added: Twitter, Google Search.
---
 .gitignore                    |   1 +
 README.md                     |  37 ++++++
 rss-bridge-flickr-explore.php |  83 ++++---------
 rss-bridge-googlesearch.php   |  41 +++++++
 rss-bridge-lib.php            | 214 ++++++++++++++++++++++++++++++++++
 rss-bridge-twitter.php        |  40 +++++++
 6 files changed, 356 insertions(+), 60 deletions(-)
 create mode 100644 rss-bridge-googlesearch.php
 create mode 100644 rss-bridge-lib.php
 create mode 100644 rss-bridge-twitter.php

diff --git a/.gitignore b/.gitignore
index 172d97a9..e14b1fbd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 ## Eclipse
 #################
 simple_html_dom.php
+data/
 *.pydevproject
 .project
 .metadata
diff --git a/README.md b/README.md
index 5450c313..5ba12f66 100644
--- a/README.md
+++ b/README.md
@@ -7,17 +7,52 @@ Supported sites/pages
 ===
 
  * `rss-bridge-flickr-explore.php` : [Latest interesting images](http://www.flickr.com/explore) from Flickr.
+ * `rss-bridge-googlesearch.php` : Most recent results from Google Search. Parameters:
+   * q=keyword : Keyword search.
+ * `rss-bridge-twitter.php` : Twitter. Parameters:
+   * q=keyword : Keyword search.
+   * u=username : Get user timeline.
 
+Output format
+===
+The output format can be chosen in any rss-bridge with the `format` parameter:
+
+ * `format=atom` (default): ATOM Feed.
+ * `format=json` : JSON
+ * `format=html` : HTML page
+ * `format=plaintext` : raw text (PHP array, as returned by print_r)
+
+If format is not specified, ATOM format will be used.
+
+Examples
+===
+ * `rss-bridge-twitter.php?u=Dinnerbone` : Get Dinnerbone (Minecraft developer) timeline, in ATOM format.
+ * `rss-bridge-twitter.php?q=minecraft&format=html` : Everything Minecraft from Twitter, in html format.
+ * `rss-bridge-flickr-explore.php` : Latest interesting images from Flickr, in ATOM format.
+
+   
 Requirements
 ===
 
  * php 5.3
  * [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net/)
  
+Author
+===
+I'm sebsauvage, webmaster of [sebsauvage.net](http://sebsauvage.net), author of [Shaarli](http://sebsauvage.net/wiki/doku.php?id=php:shaarli) and [ZeroBin](http://sebsauvage.net/wiki/doku.php?id=php:zerobin).
+
+Thanks to [Mitsukarenai](https://github.com/Mitsukarenai) for the inspiration.
+
 Licence
 ===
 Code is public domain.
 
+
+Technical notes
+===
+  * There is a cache so that source services won't ban you even if you hammer the rss-bridge with requests. Each bridge has a different duration for the cache. The `cache` subdirectory will be automatically created. You can purge it whenever you want.
+  * To implement a new rss-bridge, import `rss-bridge-lib.php` and subclass `RssBridgeAbstractClass`. Look at existing bridges for examples. For items you generate in `$this->items`, only `uri` and `title` are mandatory in each item. `timestamp` and `content` are optional but recommended. Any additional key will be ignored by the ATOM feed (but will be output in JSON).
+
 Rant
 ===
 
@@ -29,4 +64,6 @@ You're not social when you hamper sharing by removing RSS. You're happy to have
 
 We want to share with friends, using open protocols: RSS, XMPP, whatever. Because no one wants to have *your* service with *your* applications using *your* API forced-feeded to them. Friends must be free to choose whatever software and service they want.
 
+We are rebuilding bridges you have wilfully destroyed.
+
 Get your shit together: Put RSS back in.
\ No newline at end of file
diff --git a/rss-bridge-flickr-explore.php b/rss-bridge-flickr-explore.php
index 09f0fef0..e7f153eb 100644
--- a/rss-bridge-flickr-explore.php
+++ b/rss-bridge-flickr-explore.php
@@ -1,66 +1,29 @@
 <?php
-/* Flickr Explorer RSS bridge.
-   Returns a feed all new interesting images from http://www.flickr.com/explore
-   Licence: Public domain.
-   Returns ATOM feed by default.
-   Other available formats: ?format=plaintext and ?format=json
-*/
-ini_set('user_agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:20.0) Gecko/20100101 Firefox/20.0');
-date_default_timezone_set('UTC');
-//ini_set('display_errors','1');
-//error_reporting(E_ALL);
+require_once('rss-bridge-lib.php');
 
-function returnError($code, $message) { header("HTTP/1.1 $code"); header('content-type: text/plain'); die($message); }
-
-if (!file_exists('simple_html_dom.php')) { returnError('404 Not Found', 'ERROR: "PHP Simple HTML DOM Parser" is missing. Get it from http://simplehtmldom.sourceforge.net/  and place the script "simple_html_dom.php" in the same folder to allow me to work.'); }
-require_once('simple_html_dom.php');
-
-$html = file_get_html('http://www.flickr.com/explore') or returnError('404 Not Found', 'ERROR: could not request Flickr');
-$items = Array();
-foreach($html->find('span.photo_container') as $element) 
+/**
+ * RssBridgeFlickrExplore 
+ * Returns the newest interesting images from http://www.flickr.com/explore
+ */
+class RssBridgeFlickrExplore extends RssBridgeAbstractClass
 {
-    $item['href'] = 'http://flickr.com'.$element->find('a',0)->href;  // Page URI
-    $item['thumbnailUri'] = $element->find('img',0)->getAttribute('data-defer-src');  // Thumbnail URI
-    $item['title'] = $element->find('a',0)->title;  // Photo title
-    $items[] = $item;
-}
-
-if(empty($items)) { returnError('404 Not Found', 'ERROR: no results.'); }
-$format = 'atom';
-if (!empty($_GET['format'])) { $format = $_GET['format']; }
-switch($format) 
-{
-    case 'plaintext':
-    case 'json':
-    case 'atom':
-        break;
-    default:
-        $format='atom';
-}
-
-if($format == 'plaintext') { header('content-type: text/plain;charset=utf8'); print_r($items); exit; }
-if($format == 'json') { header('content-type: application/json'); $items = json_encode($items); exit($items); }
-if($format == 'atom') 
-{
-    header('content-type: application/atom+xml; charset=UTF-8');
-    echo '<?xml version="1.0" encoding="UTF-8"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:thr="http://purl.org/syndication/thread/1.0" xml:lang="en-US">'."\n";
-    echo '<title type="text">Flickr Explore</title>'."\n";
-    echo '<id>http'.(isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 's' : '')."://{$_SERVER['HTTP_HOST']}{$_SERVER['PATH_INFO']}".'/</id>'."\n";
-    echo '<updated>'.date(DATE_ATOM, $tweets['0']['timestamp']).'</updated>'."\n";
-    echo '<link rel="alternate" type="text/html" href="http://www.flickr.com/explore" />'."\n";
-    echo '<link rel="self" href="http'.(isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 's' : '')."://{$_SERVER['HTTP_HOST']}".htmlentities($_SERVER['REQUEST_URI']).'" />'."\n"."\n";
-
-    foreach($items as $item) {
-        echo '<entry><author><name>Flickr</name><uri>http://flickr.com/</uri></author>'."\n";
-        echo '<title type="html"><![CDATA['.$item['title'].']]></title>'."\n";
-        echo '<link rel="alternate" type="text/html" href="'.$item['href'].'" />'."\n";
-        echo '<id>'.$item['href'].'</id>'."\n";
-        echo '<updated></updated>'."\n"; // FIXME: date ???
-        echo '<content type="html"><![CDATA[<a href="'.$item['href'].'"><img src="'.$item['thumbnailUri'].'" /></a>]]></content>'."\n";
-        echo '</entry>'."\n\n";
+    protected $bridgeName = 'Flickr Explore';
+    protected $bridgeURI = 'http://www.flickr.com/explore';
+    protected $bridgeDescription = 'Returns the latest interesting images from Flickr';
+    protected $cacheDuration = 360;  // In minutes: 6 hours. No need to get more.
+    protected function collectData($request) {  // Fills $this->items with the photos listed on the Explore page.
+        $html = file_get_html('http://www.flickr.com/explore') or $this->returnError('404 Not Found', 'ERROR: could not request Flickr.');
+        $this->items = Array();
+        foreach($html->find('span.photo_container') as $element) {
+            $link = $element->find('a',0); $image = $element->find('img',0);
+            if (!$link || !$image) { continue; } // Unexpected markup: skip this photo instead of crashing on a null node.
+            $uri = 'http://flickr.com'.$link->href; $thumbnailUri = $image->getAttribute('data-defer-src');
+            $content = '<a href="'.$uri.'"><img src="'.$thumbnailUri.'" /></a>'; // FIXME: Filter javascript ?
+            $this->items[] = Array('uri' => $uri, 'thumbnailUri' => $thumbnailUri, 'content' => $content, 'title' => $link->title);
         }
-    echo '</feed>';
-    exit;
+    }
 }
 
-exit();
+$bridge = new RssBridgeFlickrExplore();
+$bridge->process(); // Serve the cached page if still fresh; otherwise collect the data, output it and cache it.
+?>
\ No newline at end of file
diff --git a/rss-bridge-googlesearch.php b/rss-bridge-googlesearch.php
new file mode 100644
index 00000000..c19a6ca3
--- /dev/null
+++ b/rss-bridge-googlesearch.php
@@ -0,0 +1,41 @@
+<?php
+require_once('rss-bridge-lib.php');
+
+/**
+ * RssBridgeGoogleSearch
+ * Search Google for most recent pages regarding a specific topic.
+ * Returns the 100 most recent links in results in past year,
+ * sorting by date (most recent first).
+ * Example:
+ * http://www.google.com/search?q=sebsauvage&num=100&complete=0&tbs=qdr:y,sbd:1
+ *    complete=0&num=100 : get 100 results
+ *    qdr:y : in past year
+ *    sbd:1 : sort by date (will only work if qdr: is specified)
+ */
+ 
+class RssBridgeGoogleSearch extends RssBridgeAbstractClass
+{
+    protected $bridgeName = 'Google search';
+    protected $bridgeURI = 'http://google.com';
+    protected $bridgeDescription = 'Returns most recent results from Google search.';
+    protected $cacheDuration = 30; // 30 minutes, otherwise you could get banned by Google, or stumble upon their captcha.
+    protected function collectData($request) {
+        if (!isset($request['q'])) { $this->returnError('400 Bad Request', 'ERROR: You must specify a keyword (?q=...).'); }
+        /* keyword search mode */
+        $html = file_get_html('http://www.google.com/search?q='.urlencode($request['q']).'&num=100&complete=0&tbs=qdr:y,sbd:1') or $this->returnError('404 Not Found', 'ERROR: no results for this query.');
+        $this->items = Array();
+        $results = $html->find('div[id=ires]',0);
+        if (!$results) { return; } // No result container (captcha page, markup change): process() then answers "no results".
+        foreach($results->find('li[class=g]') as $element) {
+            $link = $element->find('a[href]',0); $title = $element->find('h3',0); $snippet = $element->find('span[class=st]',0);
+            if (!$link || !$title) { continue; } // 'uri' and 'title' are mandatory: skip blocks that lack them.
+            $item = Array('uri' => $link->href, 'title' => $title->plaintext); // Fresh array: nothing leaks from the previous result.
+            if ($snippet) { $item['content'] = $snippet->plaintext; }
+            $this->items[] = $item;
+        }
+    }
+} 
+
+$bridge = new RssBridgeGoogleSearch();
+$bridge->process();
+?>
\ No newline at end of file
diff --git a/rss-bridge-lib.php b/rss-bridge-lib.php
new file mode 100644
index 00000000..b4ff7f40
--- /dev/null
+++ b/rss-bridge-lib.php
@@ -0,0 +1,214 @@
+<?php
+/* rss-bridge library.
+   Foundation functions for rss-bridge project.
+   See https://github.com/sebsauvage/rss-bridge
+   Licence: Public domain.
+*/
+ini_set('user_agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:20.0) Gecko/20100101 Firefox/20.0'); // User agent sent by PHP with its HTTP requests.
+date_default_timezone_set('UTC');
+error_reporting(0); // Do not report PHP errors or warnings.
+//ini_set('display_errors','1'); error_reporting(E_ALL);  // For debugging only.
+define('CACHEDIR','cache/');   // Directory containing cache files. Do not forget trailing slash.
+ob_start(); // Buffer the whole output: storeReponseInCache() reads it back with ob_get_contents().
+
+// Create cache directory if it does not exist.
+if (!is_dir(CACHEDIR)) { mkdir(CACHEDIR,0705); chmod(CACHEDIR,0705); }
+
+// Import DOM library. Answer with a 500 error and stop if simple_html_dom.php is missing.
+if (!file_exists('simple_html_dom.php')) 
+{ 
+    header('HTTP/1.1 500 Internal Server Error'); 
+    header('Content-Type: text/plain'); 
+    die('"PHP Simple HTML DOM Parser" is missing. Get it from http://simplehtmldom.sourceforge.net/ and place the script "simple_html_dom.php" in the same folder to allow me to work.'); 
+}
+require_once('simple_html_dom.php');
+
+/**
+ * Abstract RSSBridge class on which all bridges are built.
+ * It provides utility methods (cache, ATOM feed building...)
+ */
+abstract class RssBridgeAbstractClass 
+{
+    /**
+     * $items is an array of dictionaries. Each subclass must fill this array when collectData() is called.
+     * eg. $items = Array(   Array('uri'=>'http://foo.bar', 'title'=>'My beautiful foobar', 'content'=>'Hello, <b>world !</b>','timestamp'=>'1375864834'),
+     *                       Array('uri'=>'http://toto.com', 'title'=>'Welcome to toto', 'content'=>'What is this website about ?','timestamp'=>'1375868313')
+     *                   )
+     * Keys in dictionaries:
+     *    uri (string;mandatory) = The URI the item points to.
+     *    title (string;mandatory) = Title of item
+     *    content (string;optional) = item content (usually HTML code)
+     *    timestamp (string;optional) = item date. Must be in EPOCH format.
+     *    Other keys can be added, but will be ignored.
+     * $items will be used to build the ATOM feed, json and other outputs.
+     */
+    var $items;
+    
+    private $contentType;  // MIME type returned to browser.
+    
+    /**
+     * Sets the content-type returned to the browser and remembers it, so that it can be stored with the cached page.
+     * Example: $this->setContentType('text/html; charset=UTF-8')
+     */
+    private function setContentType($value)
+    {
+        $this->contentType = $value;
+        header('Content-Type: '.$value);
+    }
+    
+    /**
+     * collectData() will be called to ask the bridge to go collect data on the net.
+     * All derived classes must implement this method.
+     * This method must fill $this->items with collected items.
+     * Input: $request : The incoming request (=$_GET). This can be used or ignored by the bridge.
+     */
+    abstract protected function collectData($request);
+
+    /**
+     * Sends the given HTTP status line with a plain-text message to the user, then terminates the script.
+     * Example: $this->returnError('404 Not Found', 'ERROR: no results.');
+     */
+    protected function returnError($code, $message)
+    { 
+        header("HTTP/1.1 $code"); header('Content-Type: text/plain;charset=UTF-8');
+        die($message); 
+    }
+    
+    /**
+     * Builds an ATOM feed from $this->items and returns it to the browser.
+     * URIs are escaped (without double-encoding entities), "]]>" is neutralised inside CDATA sections and
+     * <updated> always holds a valid date: RFC 4287 forbids an empty one. Items without 'timestamp' get the feed date.
+     */
+    private function returnATOM()
+    {
+        $this->setContentType('application/atom+xml; charset=UTF-8');
+        $scheme = 'http'.(isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 's' : '');
+        $pathInfo = isset($_SERVER['PATH_INFO']) ? $_SERVER['PATH_INFO'] : '';
+        $feedTime = 0; // Feed date: most recent item timestamp, or the current time when no item carries one.
+        foreach($this->items as $item) { if (isset($item['timestamp'])) { $feedTime = max($feedTime, (int)$item['timestamp']); } }
+        if ($feedTime == 0) { $feedTime = time(); }
+        echo '<?xml version="1.0" encoding="UTF-8"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:thr="http://purl.org/syndication/thread/1.0" xml:lang="en-US">'."\n";
+        echo '<title type="text">'.htmlspecialchars($this->bridgeName).'</title>'."\n";
+        echo '<id>'.$scheme."://{$_SERVER['HTTP_HOST']}".htmlspecialchars($pathInfo).'/</id>'."\n";
+        echo '<updated>'.date(DATE_ATOM, $feedTime).'</updated>'."\n";
+        echo '<link rel="alternate" type="text/html" href="'.htmlspecialchars($this->bridgeURI).'" />'."\n";
+        echo '<link rel="self" href="'.$scheme."://{$_SERVER['HTTP_HOST']}".htmlentities($_SERVER['REQUEST_URI']).'" />'."\n"."\n";
+
+        foreach($this->items as $item) {
+            $uri = htmlspecialchars($item['uri'], ENT_QUOTES, 'UTF-8', false); // false: keep entities already present in scraped attributes.
+            $title = str_replace(']]>', ']]]]><![CDATA[>', $item['title']); // "]]>" would end the CDATA section early.
+            $content = isset($item['content']) ? str_replace(']]>', ']]]]><![CDATA[>', $item['content']) : '';
+            $updated = isset($item['timestamp']) ? (int)$item['timestamp'] : $feedTime;
+            echo '<entry><author><name>'.htmlspecialchars($this->bridgeName).'</name><uri>'.htmlspecialchars($this->bridgeURI).'</uri></author>'."\n";
+            echo '<title type="html"><![CDATA['.$title.']]></title>'."\n";
+            echo '<link rel="alternate" type="text/html" href="'.$uri.'" />'."\n";
+            echo '<id>'.$uri.'</id>'."\n";
+            echo '<updated>'.date(DATE_ATOM, $updated).'</updated>'."\n";
+            if ($content !== '') {
+                echo '<content type="html"><![CDATA['.$content.']]></content>'."\n";
+            } else {
+                echo '<content type="html"></content>'."\n";
+            }
+            // FIXME: Security: Disable Javascript ?
+            echo '</entry>'."\n\n";
+        }
+        echo '</feed>';
+    }
+    
+    /** Builds an HTML page from $this->items and returns it to the browser. Item URIs are escaped (entities kept as-is); item content is output as raw HTML. */
+    private function returnHTML()
+    {
+        $this->setContentType('text/html; charset=UTF-8');
+        echo '<html><head><title>'.htmlspecialchars($this->bridgeName).'</title>';
+        echo '<style>body{font-family:"Trebuchet MS",Verdana,Arial,Helvetica,sans-serif;font-size:10pt;background-color:#aaa;}div.rssitem{border:1px solid black;padding:5px;margin:10px;background-color:#fff;}</style></head><body>';
+        echo '<h1>'.htmlspecialchars($this->bridgeName).'</h1>';
+        foreach($this->items as $item) {
+            echo '<div class="rssitem"><h2><a href="'.htmlspecialchars($item['uri'], ENT_QUOTES, 'UTF-8', false).'">'.htmlspecialchars(strip_tags($item['title'])).'</a></h2>';
+            if (isset($item['timestamp'])) { echo '<small>'.date(DATE_ATOM, $item['timestamp']).'</small>'; }
+            if (isset($item['content'])) { echo '<p>'.$item['content'].'</p>'; }
+            echo "</div>\n\n";
+        }
+        echo '</body></html>';
+    }
+    
+    /**
+     * Encodes $this->items with json_encode() and returns the result to browser as application/json.
+     */   
+    private function returnJSON()
+    {
+        $this->setContentType('application/json'); 
+        echo json_encode($this->items);
+    }
+    
+    /**
+     * Returns $this->items as raw php data (print_r() dump), served as text/plain.
+     */
+    private function returnPlaintext()
+    {
+        $this->setContentType('text/plain;charset=UTF-8'); 
+        print_r($this->items); 
+    }
+    
+    /**
+     * Processes the request: serves the cached page if still fresh, else collects data, outputs it in the requested format (ATOM by default) and caches it.
+     */
+    public function process()
+    {
+        $this->serveCachedVersion();
+
+        // Cache file does not exist or has expired: We re-fetch the results and cache them.
+        $this->collectData($_GET);
+        if (empty($this->items)) { $this->returnError('404 Not Found', 'ERROR: no results.'); }
+
+        $format = 'atom';
+        if (!empty($_GET['format'])) { $format = $_GET['format']; }
+        switch($format) {
+            case 'plaintext':
+                $this->returnPlaintext();
+                break;
+            case 'json':
+                $this->returnJSON();
+                break;               
+            case 'html':
+                $this->returnHTML();
+                break;              
+            default:
+                $this->returnATOM();
+        }
+        
+        $this->storeReponseInCache();
+    }
+
+    /**
+     * Returns the cached version of current request URI directly to the browser
+     * if it exists, has not expired and can be decoded; the script then stops.
+     * Continues execution if no usable cached version is available.
+     */
+    private function serveCachedVersion()
+    {
+        $cachefile = CACHEDIR.hash('sha1',$_SERVER['REQUEST_URI']).'.cache'; // Cache path and filename
+        if (!file_exists($cachefile)) { return; } // No cache for this request.
+        $cachetime = filemtime($cachefile);
+        if (time() - ($this->cacheDuration*60) >= $cachetime) { return; } // Cache file has expired ($cacheDuration is in minutes).
+        $data = json_decode(file_get_contents($cachefile),true);
+        // Empty or corrupt cache file (decodes to NULL or lacks a key): ignore it so the page is regenerated instead of serving a blank page.
+        if (!is_array($data) || !isset($data['data'], $data['Content-Type'])) { return; }
+        header('Content-Type: '.$data['Content-Type']); // Send proper MIME Type
+        header('X-Cached-Version: '.date(DATE_ATOM, $cachetime));
+        echo $data['data'];
+        exit();
+    }
+    
+    /**
+     * Stores the generated page (and its Content-Type) in cache, then flushes it to the browser. Nothing is cached if json_encode() fails.
+     */
+    private function storeReponseInCache()
+    {
+        $cachefile = CACHEDIR.hash('sha1',$_SERVER['REQUEST_URI']).'.cache'; // Cache path and filename
+        $encoded = json_encode(Array('data'=>ob_get_contents(), 'Content-Type'=>$this->contentType));
+        if ($encoded !== false) { file_put_contents($cachefile,$encoded); } // A failed encoding would otherwise leave an empty cache file.
+        ob_end_flush();
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/rss-bridge-twitter.php b/rss-bridge-twitter.php
new file mode 100644
index 00000000..b08ac0d9
--- /dev/null
+++ b/rss-bridge-twitter.php
@@ -0,0 +1,40 @@
+<?php
+require_once('rss-bridge-lib.php');
+
+/**
+ * RssBridgeTwitter 
+ * Based on https://github.com/mitsukarenai/twitterbridge-noapi
+ */
+class RssBridgeTwitter extends RssBridgeAbstractClass
+{
+    protected $bridgeName = 'Twitter Bridge';
+    protected $bridgeURI = 'http://twitter.com';
+    protected $bridgeDescription = 'Returns user timelines or keyword search from http://twitter.com without using their API.';
+    protected $cacheDuration = 5; // 5 minutes
+    protected function collectData($request) {
+        $html = '';
+        if (isset($request['q'])) {   /* keyword search mode */
+            $html = file_get_html('http://twitter.com/search/realtime?q='.urlencode($request['q']).'+include:retweets&src=typd') or $this->returnError('404 Not Found', 'ERROR: no results for this query.');
+        } elseif (isset($request['u'])) {   /* user timeline mode */
+            $html = file_get_html('http://twitter.com/'.urlencode($request['u'])) or $this->returnError('404 Not Found', 'ERROR: requested username can\'t be found.');
+        } else {
+            $this->returnError('400 Bad Request', 'ERROR: You must specify a keyword (?q=...) or a Twitter username (?u=...).');
+        }
+        $this->items = Array();
+        foreach($html->find('div.tweet') as $tweet) {
+            $username = $tweet->find('span.username', 0); $avatar = $tweet->find('img', 0); $link = $tweet->find('a.details', 0); $time = $tweet->find('span._timestamp', 0); $text = $tweet->find('p.tweet-text', 0);
+            if (!$username || !$link || !$text) { continue; } // Not a regular tweet block: skip it rather than crash on a null node.
+            $item = Array('username' => trim(substr($username->plaintext, 1)), 'fullname' => $tweet->getAttribute('data-name')); // username without its leading character; fullname (pseudonym)
+            $item['avatar'] = $avatar ? $avatar->src : ''; $item['id'] = $tweet->getAttribute('data-tweet-id'); // avatar link and TweetID
+            $item['uri'] = 'https://twitter.com'.$link->getAttribute('href'); // get tweet link
+            if ($time) { $item['timestamp'] = $time->getAttribute('data-time'); } // tweet timestamp (optional key)
+            $item['content'] = str_replace('href="/', 'href="https://twitter.com/', strip_tags($text->innertext, '<a>')); // extract tweet text
+            $item['title'] = $item['fullname'] . ' (@'.$item['username'] . ') | ' . $item['content'];
+            $this->items[] = $item;
+        }
+    }
+} 
+
+$bridge = new RssBridgeTwitter();
+$bridge->process();
+?>
\ No newline at end of file