From bc1ef5b94a711a0db249f1773db9b3ca1da31c6c Mon Sep 17 00:00:00 2001 From: Alexis J Date: Wed, 4 Mar 2015 18:02:47 +0100 Subject: [PATCH 1/4] Add some filters to clean URLs --- index.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/index.php b/index.php index 99c3765..d714040 100644 --- a/index.php +++ b/index.php @@ -1646,6 +1646,11 @@ function renderPage() $i=strpos($url,'&utm_source='); if ($i!==false) $url=substr($url,0,$i); $i=strpos($url,'?utm_source='); if ($i!==false) $url=substr($url,0,$i); $i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i); + $i=strpos($url,'?fb_'); if ($i!==false) $url=substr($url,0,$i); + $i=strpos($url,'?__scoop'); if ($i!==false) $url=substr($url,0,$i); + $i=strpos($url,'#tk.rss_all?'); if ($i!==false) $url=substr($url,0,$i); + $i=strpos($url,'?utm_campaign='); if ($i!==false) $url=substr($url,0,$i); + $i=strpos($url,'?utm_medium='); if ($i!==false) $url=substr($url,0,$i); $link_is_new = false; $link = $LINKSDB->getLinkFromUrl($url); // Check if URL is not already in database (in this case, we will edit the existing link) From ad2a397c66a3da8061564602b43db6f2002f0064 Mon Sep 17 00:00:00 2001 From: nodiscc Date: Wed, 4 Mar 2015 19:52:24 +0100 Subject: [PATCH 2/4] cleanup: refactor annoying URL patterns in a single loop * fixes https://github.com/shaarli/Shaarli/issues/133 --- index.php | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/index.php b/index.php index d714040..cc3d736 100644 --- a/index.php +++ b/index.php @@ -1642,15 +1642,12 @@ function renderPage() { $url=$_GET['post']; - // We remove the annoying parameters added by FeedBurner and GoogleFeedProxy (?utm_source=...) 
- $i=strpos($url,'&utm_source='); if ($i!==false) $url=substr($url,0,$i); - $i=strpos($url,'?utm_source='); if ($i!==false) $url=substr($url,0,$i); - $i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i); - $i=strpos($url,'?fb_'); if ($i!==false) $url=substr($url,0,$i); - $i=strpos($url,'?__scoop'); if ($i!==false) $url=substr($url,0,$i); - $i=strpos($url,'#tk.rss_all?'); if ($i!==false) $url=substr($url,0,$i); - $i=strpos($url,'?utm_campaign='); if ($i!==false) $url=substr($url,0,$i); - $i=strpos($url,'?utm_medium='); if ($i!==false) $url=substr($url,0,$i); + // We remove the annoying parameters added by FeedBurner, GoogleFeedProxy, Facebook... + $annoyingpatterns = array('&utm_source=', '?utm_source=', '#xtor=RSS-', '?fb_', '?__scoop', '#tk.rss_all?', '?utm_campaign=', '?utm_medium='); + foreach($annoyingpatterns as $pattern) + { + $i=strpos($url,$pattern); if ($i!==false) $url=substr($url,0,$i); + } $link_is_new = false; $link = $LINKSDB->getLinkFromUrl($url); // Check if URL is not already in database (in this case, we will edit the existing link) From 403a19940961eaf3edae84c7e9c4fa0bd074e940 Mon Sep 17 00:00:00 2001 From: nodiscc Date: Thu, 5 Mar 2015 13:33:30 +0100 Subject: [PATCH 3/4] Improve annoying URL parameters cleaning: * Use regular expressions to avoid duplicating params depending on their position in the URL (&param=,?param=) * Only remove the relevant URL pattern and don't remove following params, fixes https://github.com/shaarli/Shaarli/issues/136 * Credits to Marcus Rohrmoser (https://github.com/mro) --- index.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/index.php b/index.php index cc3d736..0b507b4 100644 --- a/index.php +++ b/index.php @@ -1642,11 +1642,12 @@ function renderPage() { $url=$_GET['post']; + // We remove the annoying parameters added by FeedBurner, GoogleFeedProxy, Facebook... 
- $annoyingpatterns = array('&utm_source=', '?utm_source=', '#xtor=RSS-', '?fb_', '?__scoop', '#tk.rss_all?', '?utm_campaign=', '?utm_medium='); + $annoyingpatterns = array('/[\?&]utm_source=[^&]*/', '/[\?&]utm_campaign=[^&]*/', '/[\?&]utm_medium=[^&]*/', '/#xtor=RSS-[^&]*/', '/[\?&]fb_[^&]*/', '/[\?&]__scoop[^&]*/', '/#tk\.rss_all\?/'); foreach($annoyingpatterns as $pattern) { - $i=strpos($url,$pattern); if ($i!==false) $url=substr($url,0,$i); + $url = preg_replace($pattern, "", $url); } $link_is_new = false; From baf5cbf27d18467d838a24b6f451036cebaa27bf Mon Sep 17 00:00:00 2001 From: nodiscc Date: Thu, 5 Mar 2015 13:40:43 +0100 Subject: [PATCH 4/4] Improve URL cleaning: * also remove action_type_map, action_ref_map and action_object maps params used by facebook --- index.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.php b/index.php index 0b507b4..bc4fa1e 100644 --- a/index.php +++ b/index.php @@ -1644,7 +1644,7 @@ function renderPage() // We remove the annoying parameters added by FeedBurner, GoogleFeedProxy, Facebook... - $annoyingpatterns = array('/[\?&]utm_source=[^&]*/', '/[\?&]utm_campaign=[^&]*/', '/[\?&]utm_medium=[^&]*/', '/#xtor=RSS-[^&]*/', '/[\?&]fb_[^&]*/', '/[\?&]__scoop[^&]*/', '/#tk\.rss_all\?/'); + $annoyingpatterns = array('/[\?&]utm_source=[^&]*/', '/[\?&]utm_campaign=[^&]*/', '/[\?&]utm_medium=[^&]*/', '/#xtor=RSS-[^&]*/', '/[\?&]fb_[^&]*/', '/[\?&]__scoop[^&]*/', '/#tk\.rss_all\?/', '/[\?&]action_ref_map=[^&]*/', '/[\?&]action_type_map=[^&]*/', '/[\?&]action_object_map=[^&]*/'); foreach($annoyingpatterns as $pattern) { $url = preg_replace($pattern, "", $url);