From a973afeac7b7399d35b881920f0afc1947765ccd Mon Sep 17 00:00:00 2001 From: VirtualTam Date: Thu, 28 Jul 2016 22:54:33 +0200 Subject: [PATCH] Refactor bookmark import using a generic Netscape parser Relates to #607 Relates to #608 Relates to #493 (abandoned) Additions: - use Composer's autoload to load 3rd-party dependencies under vendor/ Modifications: - [import] replace the current parser with a generic, stable parser - move code to application/NetscapeBookmarkUtils - improve status report after parsing - [router] use the same endpoint for both bookmark upload and import dialog - [template] update bookmark import options - allow adding tags to all imported links - allow selecting the visibility (privacy) of imported links - [tests] ensure bookmarks are properly parsed and imported in the LinkDB - reuse reference input from the parser's test data See: - https://github.com/shaarli/netscape-bookmark-parser - https://getcomposer.org/doc/01-basic-usage.md#autoloading Signed-off-by: VirtualTam --- application/NetscapeBookmarkUtils.php | 142 +++++ index.php | 139 ++--- .../BookmarkExportTest.php} | 4 +- .../BookmarkImportTest.php | 518 ++++++++++++++++++ tests/NetscapeBookmarkUtils/input/empty.htm | 0 .../input/netscape_basic.htm | 11 + .../input/netscape_nested.htm | 31 ++ .../input/no_doctype.htm | 7 + .../NetscapeBookmarkUtils/input/same_date.htm | 11 + tpl/import.html | 38 +- 10 files changed, 779 insertions(+), 122 deletions(-) rename tests/{NetscapeBookmarkUtilsTest.php => NetscapeBookmarkUtils/BookmarkExportTest.php} (97%) create mode 100644 tests/NetscapeBookmarkUtils/BookmarkImportTest.php create mode 100644 tests/NetscapeBookmarkUtils/input/empty.htm create mode 100644 tests/NetscapeBookmarkUtils/input/netscape_basic.htm create mode 100644 tests/NetscapeBookmarkUtils/input/netscape_nested.htm create mode 100644 tests/NetscapeBookmarkUtils/input/no_doctype.htm create mode 100644 tests/NetscapeBookmarkUtils/input/same_date.htm diff --git 
a/application/NetscapeBookmarkUtils.php b/application/NetscapeBookmarkUtils.php index fdbb0ad..b99a432 100644 --- a/application/NetscapeBookmarkUtils.php +++ b/application/NetscapeBookmarkUtils.php @@ -51,4 +51,146 @@ class NetscapeBookmarkUtils return $bookmarkLinks; } + + /** + * Generates an import status summary + * + * @param string $filename name of the file to import + * @param int $filesize size of the file to import + * @param int $importCount how many links were imported + * @param int $overwriteCount how many links were overwritten + * @param int $skipCount how many links were skipped + * + * @return string Summary of the bookmark import status + */ + private static function importStatus( + $filename, + $filesize, + $importCount=0, + $overwriteCount=0, + $skipCount=0 + ) + { + $status = 'File '.$filename.' ('.$filesize.' bytes) '; + if ($importCount == 0 && $overwriteCount == 0 && $skipCount == 0) { + $status .= 'has an unknown file format. Nothing was imported.'; + } else { + $status .= 'was successfully processed: '.$importCount.' links imported, '; + $status .= $overwriteCount.' links overwritten, '; + $status .= $skipCount.' links skipped.'; + } + return $status; + } + + /** + * Imports Web bookmarks from an uploaded Netscape bookmark dump + * + * @param array $post Server $_POST parameters + * @param array $files Server $_FILES parameters + * @param LinkDB $linkDb Loaded LinkDB instance + * @param string $pagecache Page cache + * + * @return string Summary of the bookmark import status + */ + public static function import($post, $files, $linkDb, $pagecache) + { + $filename = $files['filetoupload']['name']; + $filesize = $files['filetoupload']['size']; + $data = file_get_contents($files['filetoupload']['tmp_name']); + + // Sniff file type + if (! startsWith($data, '')) { + return self::importStatus($filename, $filesize); + } + + // Overwrite existing links? + $overwrite = ! empty($post['overwrite']); + + // Add tags to all imported links? 
+ if (empty($post['default_tags'])) { + $defaultTags = array(); + } else { + $defaultTags = preg_split( + '/[\s,]+/', + escape($post['default_tags']) + ); + } + + // links are imported as public by default + $defaultPrivacy = 0; + + $parser = new NetscapeBookmarkParser( + true, // nested tag support + $defaultTags, // additional user-specified tags + strval(1 - $defaultPrivacy) // defaultPub = 1 - defaultPrivacy + ); + $bookmarks = $parser->parseString($data); + + $importCount = 0; + $overwriteCount = 0; + $skipCount = 0; + + foreach ($bookmarks as $bkm) { + $private = $defaultPrivacy; + if (empty($post['privacy']) || $post['privacy'] == 'default') { + // use value from the imported file + $private = $bkm['pub'] == '1' ? 0 : 1; + } else if ($post['privacy'] == 'private') { + // all imported links are private + $private = 1; + } else if ($post['privacy'] == 'public') { + // all imported links are public + $private = 0; + } + + $newLink = array( + 'title' => $bkm['title'], + 'url' => $bkm['uri'], + 'description' => $bkm['note'], + 'private' => $private, + 'linkdate'=> '', + 'tags' => $bkm['tags'] + ); + + $existingLink = $linkDb->getLinkFromUrl($bkm['uri']); + + if ($existingLink !== false) { + if ($overwrite === false) { + // Do not overwrite an existing link + $skipCount++; + continue; + } + + // Overwrite an existing link, keep its date + $newLink['linkdate'] = $existingLink['linkdate']; + $linkDb[$existingLink['linkdate']] = $newLink; + $importCount++; + $overwriteCount++; + continue; + } + + // Add a new link + $newLinkDate = new DateTime('@'.strval($bkm['time'])); + while (!empty($linkDb[$newLinkDate->format(LinkDB::LINK_DATE_FORMAT)])) { + // Ensure the date/time is not already used + // - this hack is necessary as the date/time acts as a primary key + // - apply 1 second increments until an unused index is found + // See https://github.com/shaarli/Shaarli/issues/351 + $newLinkDate->add(new DateInterval('PT1S')); + } + $linkDbDate = 
$newLinkDate->format(LinkDB::LINK_DATE_FORMAT); + $newLink['linkdate'] = $linkDbDate; + $linkDb[$linkDbDate] = $newLink; + $importCount++; + } + + $linkDb->savedb($pagecache); + return self::importStatus( + $filename, + $filesize, + $importCount, + $overwriteCount, + $skipCount + ); + } } diff --git a/index.php b/index.php index 9ae798b..1f148d7 100644 --- a/index.php +++ b/index.php @@ -44,6 +44,10 @@ error_reporting(E_ALL^E_WARNING); //error_reporting(-1); +// 3rd-party libraries +require_once 'inc/rain.tpl.class.php'; +require_once __DIR__ . '/vendor/autoload.php'; + // Shaarli library require_once 'application/ApplicationUtils.php'; require_once 'application/Cache.php'; @@ -65,7 +69,6 @@ require_once 'application/Utils.php'; require_once 'application/PluginManager.php'; require_once 'application/Router.php'; require_once 'application/Updater.php'; -require_once 'inc/rain.tpl.class.php'; // Ensure the PHP version is supported try { @@ -1468,26 +1471,37 @@ function renderPage($conf, $pluginManager) exit; } - // -------- User is uploading a file for import - if (isset($_SERVER['QUERY_STRING']) && startsWith($_SERVER['QUERY_STRING'], 'do=upload')) - { - // If file is too big, some form field may be missing. - if (!isset($_POST['token']) || (!isset($_FILES)) || (isset($_FILES['filetoupload']['size']) && $_FILES['filetoupload']['size']==0)) - { - $returnurl = ( empty($_SERVER['HTTP_REFERER']) ? '?' : $_SERVER['HTTP_REFERER'] ); - echo ''; + if ($targetPage == Router::$PAGE_IMPORT) { + // Upload a Netscape bookmark dump to import its contents + + if (! isset($_POST['token']) || ! 
isset($_FILES['filetoupload'])) { + // Show import dialog + $PAGE->assign('maxfilesize', getMaxFileSize()); + $PAGE->renderPage('import'); exit; } - if (!tokenOk($_POST['token'])) die('Wrong token.'); - importFile($LINKSDB); - exit; - } - // -------- Show upload/import dialog: - if ($targetPage == Router::$PAGE_IMPORT) - { - $PAGE->assign('maxfilesize',getMaxFileSize()); - $PAGE->renderPage('import'); + // Import bookmarks from an uploaded file + if (isset($_FILES['filetoupload']['size']) && $_FILES['filetoupload']['size'] == 0) { + // The file is too big or some form field may be missing. + echo ''; + exit; + } + if (! tokenOk($_POST['token'])) { + die('Wrong token.'); + } + $status = NetscapeBookmarkUtils::import( + $_POST, + $_FILES, + $LINKSDB, + $conf->get('resource.page_cache') + ); + echo ''; exit; } @@ -1544,95 +1558,6 @@ function renderPage($conf, $pluginManager) exit; } -/** - * Process the import file form. - * - * @param LinkDB $LINKSDB Loaded LinkDB instance. - * @param ConfigManager $conf Configuration Manager instance. - */ -function importFile($LINKSDB, $conf) -{ - if (!isLoggedIn()) { die('Not allowed.'); } - - $filename=$_FILES['filetoupload']['name']; - $filesize=$_FILES['filetoupload']['size']; - $data=file_get_contents($_FILES['filetoupload']['tmp_name']); - $private = (empty($_POST['private']) ? 0 : 1); // Should the links be imported as private? - $overwrite = !empty($_POST['overwrite']) ; // Should the imported links overwrite existing ones? - $import_count=0; - - // Sniff file type: - $type='unknown'; - if (startsWith($data,'')) $type='netscape'; // Netscape bookmark file (aka Firefox). - - // Then import the bookmarks. - if ($type=='netscape') - { - // This is a standard Netscape-style bookmark file. - // This format is supported by all browsers (except IE, of course), also Delicious, Diigo and others. - foreach(explode('
',$data) as $html) // explode is very fast - { - $link = array('linkdate'=>'','title'=>'','url'=>'','description'=>'','tags'=>'','private'=>0); - $d = explode('
',$html); - if (startsWith($d[0], '(.*?)!i',$d[0],$matches); $link['title'] = (isset($matches[1]) ? trim($matches[1]) : ''); // Get title - $link['title'] = html_entity_decode($link['title'],ENT_QUOTES,'UTF-8'); - preg_match_all('! ([A-Z_]+)=\"(.*?)"!i',$html,$matches,PREG_SET_ORDER); // Get all other attributes - $raw_add_date=0; - foreach($matches as $m) - { - $attr=$m[1]; $value=$m[2]; - if ($attr=='HREF') $link['url']=html_entity_decode($value,ENT_QUOTES,'UTF-8'); - elseif ($attr=='ADD_DATE') - { - $raw_add_date=intval($value); - if ($raw_add_date>30000000000) $raw_add_date/=1000; //If larger than year 2920, then was likely stored in milliseconds instead of seconds - } - elseif ($attr=='PRIVATE') $link['private']=($value=='0'?0:1); - elseif ($attr=='TAGS') $link['tags']=html_entity_decode(str_replace(',',' ',$value),ENT_QUOTES,'UTF-8'); - } - if ($link['url']!='') - { - if ($private==1) $link['private']=1; - $dblink = $LINKSDB->getLinkFromUrl($link['url']); // See if the link is already in database. - if ($dblink==false) - { // Link not in database, let's import it... - if (empty($raw_add_date)) $raw_add_date=time(); // In case of shitty bookmark file with no ADD_DATE - - // Make sure date/time is not already used by another link. - // (Some bookmark files have several different links with the same ADD_DATE) - // We increment date by 1 second until we find a date which is not used in DB. - // (so that links that have the same date/time are more or less kept grouped by date, but do not conflict.) - while (!empty($LINKSDB[date('Ymd_His',$raw_add_date)])) { $raw_add_date++; }// Yes, I know it's ugly. - $link['linkdate']=date('Ymd_His',$raw_add_date); - $LINKSDB[$link['linkdate']] = $link; - $import_count++; - } - else // Link already present in database. - { - if ($overwrite) - { // If overwrite is required, we import link data, except date/time. 
- $link['linkdate']=$dblink['linkdate']; - $LINKSDB[$link['linkdate']] = $link; - $import_count++; - } - } - - } - } - } - $LINKSDB->savedb($conf->get('resource.page_cache')); - - echo ''; - } - else - { - echo ''; - } -} - /** * Template for the list of links (