From 4bf90735ef2967b71e47647eb7a147e226794777 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Mon, 12 Aug 2013 22:37:19 +0200 Subject: [PATCH] Corrections * Corrected GoogleBridge (URI extraction was incorrect) * Corrected ATOM format: * mime-type was incorrect * Hyperlinks were not clickable. * non-UTF8 characters are now properly filtered. * Corrected HTML format output: * Hyperlinks were not clickable. * Corrected error message when SimpleHtmlDom library is not installed. * Added changelog. --- CHANGELOG.md | 21 +++++++++++++++++++++ README.md | 21 ++++++++------------- bridges/GoogleSearchBridge.php | 9 +++++++-- formats/AtomFormat.php | 17 +++++++++++++---- formats/HtmlFormat.php | 7 +++---- lib/Format.php | 17 +++++++++++++++++ lib/RssBridge.php | 2 +- 7 files changed, 70 insertions(+), 24 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..4cab13f3 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,21 @@ +rss-bridge Changelog +=== + +Alpha 0.1 +=== + * First tagged version. + * Includes refactoring. + * Unstable. + +Current development version +=== + * Corrected GoogleBridge (URI extraction was incorrect) + * Corrected ATOM format: + * mime-type was incorrect + * Hyperlinks were not clickable. + * non-UTF8 characters are now properly filtered. + * Corrected HTML format output: + * Hyperlinks were not clickable. + * Corrected error message when SimpleHtmlDom library is not installed. + * Added changelog. + \ No newline at end of file diff --git a/README.md b/README.md index 0e6279f8..536d6150 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ rss-bridge === -Version alpha 0.1 - rss-bridge is a collection of independant php scripts capable of generating ATOM feed for specific pages which don't have one. Supported sites/pages @@ -10,19 +8,15 @@ Supported sites/pages * `FlickrExplore` : [Latest interesting images](http://www.flickr.com/explore) from Flickr. 
* `GoogleSearch` : Most recent results from Google Search. Parameters: - * q=keyword : Keyword search. - * `Twitter` : Twitter. Parameters: - * q=keyword : Keyword search. - * u=username : Get user timeline. + * `Twitter` : Twitter. Can return keyword/hashtag search or user timeline. -Easy new bridge system (detail below) ! Output format === -Output format can be used in any rss-bridge: +Output format can take several forms: - * `Atom` : ATOM Feed. - * `Json` : Json + * `Atom` : ATOM Feed, for use in RSS/Feed readers + * `Json` : Json, for consumption by other applications. * `Html` : html page * `Plaintext` : raw text (php object, as returned by print_r) @@ -35,7 +29,7 @@ Requirements === * php 5.3 - * [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net). (Put `simple_html_dom.php` in `vendor/simplehtmldom`). + * [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net). (Put `simple_html_dom.php` in `vendor/simplehtmldom/`). * Ssl lib activated in PHP config @@ -46,7 +40,8 @@ I'm sebsauvage, webmaster of [sebsauvage.net](http://sebsauvage.net), author of Thanks to [Mitsukarenai](https://github.com/Mitsukarenai) for the inspiration. Patch : -- Yves ASTIER (Draeli) : PHP optimizations, fixes, dynamic brigde/format list with all stuff behind and extend cache system. Mail : contact@yves-astier.com + + * Yves ASTIER ([Draeli](https://github.com/Draeli)) : PHP optimizations, fixes, dynamic bridge/format list with all stuff behind and extend cache system. Mail : contact@yves-astier.com Licence === @@ -56,7 +51,7 @@ Code is public domain. Technical notes === * There is a cache so that source services won't ban you even if you hammer the rss-bridge with requests. Each bridge has a different duration for the cache. The `cache` subdirectory will be automatically created. You can purge it whenever you want. - * To implement a new rss-bridge, create a new class in `bridges` directory and extends with `BridgeAbstract`. 
Look at existing bridges for examples. For items you generate in `$this->items`, only `uri` and `title` are mandatory in each item. `timestamp` and `content` are optional but recommended. Any additional key will be ignored by ATOM feed (but outputed to jSon). If you want your new bridge appear in `index.php`, don't forget add annotation. + * To implement a new rss-bridge, create a new class in `bridges` subdirectory. Look at existing bridges for examples. For items you generate in `$this->items`, only `uri` and `title` are mandatory in each item. `timestamp` and `content` are optional but recommended. Any additional key will be ignored by ATOM feed (but outputed to jSon). Rant === diff --git a/bridges/GoogleSearchBridge.php b/bridges/GoogleSearchBridge.php index dcdd01de..45694bae 100644 --- a/bridges/GoogleSearchBridge.php +++ b/bridges/GoogleSearchBridge.php @@ -28,8 +28,13 @@ class GoogleSearchBridge extends BridgeAbstract{ $emIsRes = $html->find('div[id=ires]',0); if( !is_null($emIsRes) ){ foreach($emIsRes->find('li[class=g]') as $element) { - $item = new \Item(); - $item->uri = $element->find('a[href]',0)->href; + $item = new Item(); + + // Extract direct URL from google href (eg. /url?q=...) + $t = $element->find('a[href]',0)->href; + $item->uri = 'http://google.com'.$t; + parse_str(parse_url($t, PHP_URL_QUERY),$parameters); + if (isset($parameters['q'])) { $item->uri = $parameters['q']; } $item->title = $element->find('h3',0)->plaintext; $item->content = $element->find('span[class=st]',0)->plaintext; $this->items[] = $item; diff --git a/formats/AtomFormat.php b/formats/AtomFormat.php index 2df9090c..cdd0a9ad 100644 --- a/formats/AtomFormat.php +++ b/formats/AtomFormat.php @@ -26,7 +26,8 @@ class AtomFormat extends FormatAbstract{ $entryTitle = is_null($data->title) ? '' : $data->title; $entryUri = is_null($data->uri) ? '' : $data->uri; $entryTimestamp = is_null($data->timestamp) ? 
'' : date(DATE_ATOM, $data->timestamp); - $entryContent = is_null($data->content) ? '' : 'content) . ']]>'; + // We prevent content from closing the CDATA too early. + $entryContent = is_null($data->content) ? '' : 'sanitizeHtml(str_replace(']]>','',$data->content)) . ']]>'; $entries .= << EOD; + + // Remove invalid non-UTF8 characters + // We cannot use iconv because of a bug in some versions of iconv. + // See http://www.php.net/manual/fr/function.iconv.php#108643 + //$toReturn = iconv("UTF-8", "UTF-8//IGNORE", $toReturn); + // So we use mb_convert_encoding instead: + ini_set('mbstring.substitute_character', 'none'); + $toReturn= mb_convert_encoding($toReturn, 'UTF-8', 'UTF-8'); return $toReturn; } public function display(){ - // $this - // ->setContentType('application/atom+xml; charset=' . $this->getCharset()) - // ->callContentType(); + $this + ->setContentType('application/atom+xml; charset=utf8') // We force UTF-8 in ATOM output. + ->callContentType(); return parent::display(); } diff --git a/formats/HtmlFormat.php b/formats/HtmlFormat.php index 86c61737..c52cf963 100644 --- a/formats/HtmlFormat.php +++ b/formats/HtmlFormat.php @@ -16,10 +16,9 @@ class HtmlFormat extends FormatAbstract{ $entries = ''; foreach($this->getDatas() as $data){ $entryUri = is_null($data->uri) ? $uri : $data->uri; - $entryTitle = is_null($data->title) ? '' : htmlspecialchars(strip_tags($data->title)); + $entryTitle = is_null($data->title) ? '' : $this->sanitizeHtml(strip_tags($data->title)); $entryTimestamp = is_null($data->timestamp) ? '' : '' . date(DATE_ATOM, $data->timestamp) . ''; - $entryContent = is_null($data->content) ? '' : '

' . $data->content . '

'; - + $entryContent = is_null($data->content) ? '' : '

' . $this->sanitizeHtml($data->content). '

'; $entries .= << @@ -52,7 +51,7 @@ EOD; return $toReturn; } - public function display(){ + public function display() { $this ->setContentType('text/html; charset=' . $this->getCharset()) ->callContentType(); diff --git a/lib/Format.php b/lib/Format.php index 70c8d7ab..a2ede6d0 100644 --- a/lib/Format.php +++ b/lib/Format.php @@ -90,6 +90,23 @@ abstract class FormatAbstract implements FormatInterface{ return $this->extraInfos; } + + /** + * Sanitized html while leaving it functionnal. + * The aim is to keep html as-is (with clickable hyperlinks) + * while reducing annoying and potentially dangerous things. + * Yes, I know sanitizing HTML 100% is an impossible task. + * Maybe we'll switch to http://htmlpurifier.org/ + * or http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/index.php + */ + public function sanitizeHtml($html) + { + $html = str_replace('