From b00971b2c391f3ebcd279f3c8213f7303dc16091 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Tue, 11 Dec 2018 17:11:07 +0100 Subject: [PATCH] [simplehtmldom] Update parser to version 1.7 - Update parser to version 1.7 https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.7/ References #959 -------------------- CHANGELOG -------------------- - Added code documentation to improve readability - Added unit tests for `simple_html_dom::$self_closing_tags` - Added unit tests for `simple_html_dom::$optional_closing_tags` - Added unit tests for bug reports - Added test for bug [#56](https://sourceforge.net/p/simplehtmldom/bugs/56/) - Added test for bug [#97](https://sourceforge.net/p/simplehtmldom/bugs/97/) - Added test for bug [#116](https://sourceforge.net/p/simplehtmldom/bugs/116/) - Added test for bug [#121](https://sourceforge.net/p/simplehtmldom/bugs/121/) - Added test for bug [#127](https://sourceforge.net/p/simplehtmldom/bugs/127/) - Added test for bug [#154](https://sourceforge.net/p/simplehtmldom/bugs/154/) - Added test for bug [#160](https://sourceforge.net/p/simplehtmldom/bugs/160/) - Added unit tests for memory management of the parser - Added bit flags to `simple_html_dom::load()` - Added bit flag `HDOM_SMARTY_AS_TEXT` to optionally filter Smarty scripts (#154)\ **Note**: Smarty scripts are no longer filtered by default!\ - Added build script to automate releases - Added support for attributes without whitespace to separate them - Improved documentation and readability for `$self_closing_tags` - Improved documentation and readability for `$block_tags` - Improved documentation and readability for `$optional_closing_tags` - Updated list of `simple_html_dom::$self_closing_tags` - Removed 'spacer' (obsolete) - Added 'area' - Added 'col' - Added 'meta' - Added 'param' - Added 'source' - Added 'track' - Added 'wbr' - Updated list of `simple_html_dom::$optional_closing_tags` - Removed "nobr" (obsolete) - Added 'th' as closable element to 'td' - 
Added 'td' as closable element to 'th' - Added 'optgroup' with 'optgroup' and 'option' as closable elements - Added 'optgroup' as closable element to 'option' - Added 'rp' with 'rp' and 'rt' as closable elements - Added 'rt' with 'rt' and 'rp' as closable elements - Clarified meaning of `simple_html_dom->parent` - Changed default `$offset` for `file_get_html()` from -1 to 0 (#161) - Changed `simple_html_dom::load()` to remove script tags before replacing newline characters - `simple_html_dom_node::text()` no longer adds whitespace to top level span elements (only to sub-elements) - `simple_html_dom_node::text()` adds blank lines between paragraphs - Normalized line endings in the repository to LF via `.gitattributes` - Improved performance of `simple_html_dom::parse_charset()` by approximately 25% - Improved performance of `simple_html_dom::parse()` by approximately 10% - `str_get_html()` is deprecated and should be replaced by `new simple_html_dom()` - Removed protected function `simple_html_dom::copy_until_char_escaped()` - Fixed compatibility issues with PHP 7.3 - Fixed typo (#147) - Fixed handling of incorrectly escaped text (#160) - Restore functionality of `$maxLen` in `file_get_html()` - Fixed load_file breaks if an error occurred in another script --- vendor/simplehtmldom/simple_html_dom.php | 760 ++++++++++++++++++----- 1 file changed, 595 insertions(+), 165 deletions(-) diff --git a/vendor/simplehtmldom/simple_html_dom.php b/vendor/simplehtmldom/simple_html_dom.php index b5d30898..676807d3 100644 --- a/vendor/simplehtmldom/simple_html_dom.php +++ b/vendor/simplehtmldom/simple_html_dom.php @@ -34,7 +34,7 @@ * @author S.C. Chen * @author John Schlick * @author Rus Carroll - * @version 1.5 ($Rev: 208 $) + * @version Rev. 
1.7 (214) * @package PlaceLocalInclude * @subpackage simple_html_dom */ @@ -63,20 +63,27 @@ define('HDOM_INFO_ENDSPACE',7); define('DEFAULT_TARGET_CHARSET', 'UTF-8'); define('DEFAULT_BR_TEXT', "\r\n"); define('DEFAULT_SPAN_TEXT', " "); -define('MAX_FILE_SIZE', 10000000); +define('MAX_FILE_SIZE', 600000); + +/** Contents between curly braces "{" and "}" are interpreted as text */ +define('HDOM_SMARTY_AS_TEXT', 1); + // helper functions // ----------------------------------------------------------------------------- // get html dom from file // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. -function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) +function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) { + // Ensure maximum length is greater than zero + if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } + // We DO force the tags to be terminated. $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); - // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done. - $contents = file_get_contents($url, $use_include_path, $context, $offset); + // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done. + $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen); // Paperg - use our own mechanism for getting the contents as we want to control the timeout. 
//$contents = retrieve_url_contents($url); - if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) + if (empty($contents) || strlen($contents) > $maxLen) { return false; } @@ -114,17 +121,68 @@ function dump_html_tree($node, $show_attr=true, $deep=0) */ class simple_html_dom_node { + /** + * Node type + * + * Default is {@see HDOM_TYPE_TEXT} + * + * @var int + */ public $nodetype = HDOM_TYPE_TEXT; + + /** + * Tag name + * + * Default is 'text' + * + * @var string + */ public $tag = 'text'; + + /** + * List of attributes + * + * @var array + */ public $attr = array(); + + /** + * List of child node objects + * + * @var array + */ public $children = array(); public $nodes = array(); + + /** + * The parent node object + * + * @var object|null + */ public $parent = null; + // The "info" array - see HDOM_INFO_... for what each element contains. public $_ = array(); + + /** + * Start position of the tag in the document + * + * @var int + */ public $tag_start = 0; + + /** + * The DOM object + * + * @var object|null + */ private $dom = null; + /** + * Construct new node object + * + * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes} + */ function __construct($dom) { $this->dom = $dom; @@ -240,8 +298,13 @@ class simple_html_dom_node } } - // returns the parent of node - // If a node is passed in, it will reset the parent of the current node to that one. + /** + * Return or set parent node + * + * @param object|null $parent (optional) The parent node, `null` to return + * the current parent node. + * @return object|null The parent node + */ function parent($parent=null) { // I am SURE that this doesn't work properly. 
@@ -256,13 +319,22 @@ class simple_html_dom_node return $this->parent; } - // verify that node has children + /** + * @return bool True if the node has at least one child node + */ function has_child() { return !empty($this->children); } - // returns children of node + /** + * Get child node at specified index + * + * @param int $idx The index of the child node to return, `-1` to return all + * child nodes. + * @return object|array|null The child node at the specified index, all child + * nodes or null if the index is invalid. + */ function children($idx=-1) { if ($idx===-1) @@ -276,7 +348,15 @@ class simple_html_dom_node return null; } - // returns the first child of node + /** + * Get first child node + * + * @return object|null The first child node or null if the current node has + * no child nodes. + * + * @todo Use `empty()` instead of `count()` to improve performance on large + * arrays. + */ function first_child() { if (count($this->children)>0) @@ -286,7 +366,14 @@ class simple_html_dom_node return null; } - // returns the last child of node + /** + * Get last child node + * + * @return object|null The last child node or null if the current node has + * no child nodes. + * + * @todo Use `end()` to slightly improve performance on large arrays. + */ function last_child() { if (($count=count($this->children))>0) @@ -296,7 +383,12 @@ class simple_html_dom_node return null; } - // returns the next sibling of node + /** + * Get next sibling node + * + * @return object|null The sibling node or null if the current node has no + * sibling nodes. + */ function next_sibling() { if ($this->parent===null) @@ -317,7 +409,12 @@ class simple_html_dom_node return $this->parent->children[$idx]; } - // returns the previous sibling of node + /** + * Get previous sibling node + * + * @return object|null The sibling node or null if the current node has no + * sibling nodes. 
+ */ function prev_sibling() { if ($this->parent===null) return null; @@ -329,7 +426,16 @@ class simple_html_dom_node return $this->parent->children[$idx]; } - // function to locate a specific ancestor tag in the path to the root. + /** + * Traverse ancestors to the first matching tag. + * + * @param string $tag Tag to find + * @return object|null First matching node in the DOM tree or null if no + * match was found. + * + * @todo Null is returned implicitly by calling ->parent on the root node. + * This behaviour could change at any time, rendering this function invalid. + */ function find_ancestor_tag($tag) { global $debug_object; @@ -351,7 +457,11 @@ class simple_html_dom_node return $returnDom; } - // get dom node's inner html + /** + * Get node's inner text (everything inside the opening and closing tags) + * + * @return string + */ function innertext() { if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; @@ -363,7 +473,11 @@ class simple_html_dom_node return $ret; } - // get dom node's outer text (with tag) + /** + * Get node's outer text (everything including the opening and closing tags) + * + * @return string + */ function outertext() { global $debug_object; @@ -423,7 +537,11 @@ class simple_html_dom_node return $ret; } - // get dom node's plain text + /** + * Get node's plain text (everything excluding all tags) + * + * @return string + */ function text() { if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; @@ -444,20 +562,29 @@ class simple_html_dom_node { foreach ($this->nodes as $n) { + // Start paragraph after a blank line + if ($n->tag == 'p') + { + $ret .= "\n\n"; + } + $ret .= $this->convert_text($n->text()); + + // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all. + if ($n->tag == "span") + { + $ret .= $this->dom->default_span_text; + } } - - // If this node is a span... 
add a space at the end of it so multiple spans don't run into each other. This is plaintext after all. - if ($this->tag == "span") - { - $ret .= $this->dom->default_span_text; - } - - } - return $ret; + return trim($ret); } + /** + * Get node's xml text (inner text as a CDATA section) + * + * @return string + */ function xmltext() { $ret = $this->innertext(); @@ -686,7 +813,7 @@ class simple_html_dom_node // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression. // farther study is required to determine of this should be documented or removed. // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; - $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);} @@ -986,49 +1113,262 @@ class simple_html_dom_node */ class simple_html_dom { + /** + * The root node of the document + * + * @var object + */ public $root = null; + + /** + * List of nodes in the current DOM + * + * @var array + */ public $nodes = array(); + + /** + * Callback function to run for each element in the DOM. + * + * @var callable|null + */ public $callback = null; + + /** + * Indicates how tags and attributes are matched + * + * @var bool When set to **true** tags and attributes will be converted to + * lowercase before matching. + */ public $lowercase = false; - // Used to keep track of how large the text was when we started. + + /** + * Original document size + * + * Holds the original document size. + * + * @var int + */ public $original_size; + + /** + * Current document size + * + * Holds the current document size. 
The document size is determined by the + * string length of ({@see simple_html_dom::$doc}). + * + * _Note_: Using this variable is more efficient than calling `strlen($doc)` + * + * @var int + * */ public $size; + + /** + * Current position in the document + * + * @var int + */ protected $pos; + + /** + * The document + * + * @var string + */ protected $doc; + + /** + * Current character + * + * Holds the current character at position {@see simple_html_dom::$pos} in + * the document {@see simple_html_dom::$doc} + * + * _Note_: Using this variable is more efficient than calling `substr($doc, $pos, 1)` + * + * @var string + */ protected $char; + protected $cursor; + + /** + * Parent node of the next node detected by the parser + * + * @var object + */ protected $parent; protected $noise = array(); + + /** + * Tokens considered blank in HTML + * + * @var string + */ protected $token_blank = " \t\r\n"; + + /** + * Tokens to identify the equal sign for attributes, stopping either at the + * closing tag ("/" i.e. "") or the end of an opening tag (">" i.e. + * "") + * + * @var string + */ protected $token_equal = ' =/>'; + + /** + * Tokens to identify the end of a tag name. A tag name either ends on the + * ending slash ("/" i.e. "") or whitespace ("\s\r\n\t") + * + * @var string + */ protected $token_slash = " />\r\n\t"; + + /** + * Tokens to identify the end of an attribute + * + * @var string + */ protected $token_attr = ' >'; + // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. public $_charset = ''; public $_target_charset = ''; + + /** + * Innertext for
elements + * + * @var string + */ protected $default_br_text = ""; + + /** + * Suffix for elements + * + * @var string + */ public $default_span_text = ""; - // use isset instead of in_array, performance boost about 30%... - protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); - protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); - // Known sourceforge issue #2977341 - // B tags that are not closed cause us to return everything to the end of the document. - protected $optional_closing_tags = array( - 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), - 'th'=>array('th'=>1), - 'td'=>array('td'=>1), - 'li'=>array('li'=>1), - 'dt'=>array('dt'=>1, 'dd'=>1), - 'dd'=>array('dd'=>1, 'dt'=>1), - 'dl'=>array('dd'=>1, 'dt'=>1), - 'p'=>array('p'=>1), - 'nobr'=>array('nobr'=>1), - 'b'=>array('b'=>1), - 'option'=>array('option'=>1), + /** + * Defines a list of self-closing tags (Void elements) according to the HTML + * Specification + * + * _Remarks_: + * - Use `isset()` instead of `in_array()` on array elements to boost + * performance about 30% + * - Sort elements by name for better readability! + * + * @link https://www.w3.org/TR/html HTML Specification + * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements + */ + protected $self_closing_tags = array( + 'area'=>1, + 'base'=>1, + 'br'=>1, + 'col'=>1, + 'embed'=>1, + 'hr'=>1, + 'img'=>1, + 'input'=>1, + 'link'=>1, + 'meta'=>1, + 'param'=>1, + 'source'=>1, + 'track'=>1, + 'wbr'=>1 ); - function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) + /** + * Defines a list of tags which - if closed - close all optional closing + * elements within if they haven't been closed yet. 
(So, an element where + * neither opening nor closing tag is omissible consistently closes every + * optional closing element within) + * + * _Remarks_: + * - Use `isset()` instead of `in_array()` on array elements to boost + * performance about 30% + * - Sort elements by name for better readability! + */ + protected $block_tags = array( + 'body'=>1, + 'div'=>1, + 'form'=>1, + 'root'=>1, + 'span'=>1, + 'table'=>1 + ); + + /** + * Defines elements whose end tag is omissible. + * + * * key = Name of an element whose end tag is omissible. + * * value = Names of elements whose end tag is omissible, that are closed + * by the current element. + * + * _Remarks_: + * - Use `isset()` instead of `in_array()` on array elements to boost + * performance about 30% + * - Sort elements by name for better readability! + * + * **Example** + * + * An `li` element’s end tag may be omitted if the `li` element is immediately + * followed by another `li` element. To do that, add following element to the + * array: + * + * ```php + * 'li' => array('li'), + * ``` + * + * With this, the following two examples are considered equal. Note that the + * second example is missing the closing tags on `li` elements. + * + * ```html + *
 <ul><li>First Item</li><li>Second Item</li></ul>
+ * ``` + * + *
 <ul><li>First Item</li><li>Second Item</li></ul>
+ * + * ```html + *
 <ul><li>First Item<li>Second Item</ul>
+ * ``` + * + *
 <ul><li>First Item<li>Second Item</ul>
+ * + * @var array A two-dimensional array where the key is the name of an + * element whose end tag is omissible and the value is an array of elements + * whose end tag is omissible, that are closed by the current element. + * + * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags + * + * @todo The implementation of optional closing tags doesn't work in all cases + * because it only consideres elements who close other optional closing + * tags, not taking into account that some (non-blocking) tags should close + * these optional closing tags. For example, the end tag for "p" is omissible + * and can be closed by an "address" element, whose end tag is NOT omissible. + * Currently a "p" element without closing tag stops at the next "p" element + * or blocking tag, even if it contains other elements. + * + * @todo Known sourceforge issue #2977341 + * B tags that are not closed cause us to return everything to the end of + * the document. + */ + protected $optional_closing_tags = array( + 'b'=>array('b'=>1), // Not optional, see https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element + 'dd'=>array('dd'=>1, 'dt'=>1), + 'dl'=>array('dd'=>1, 'dt'=>1), // Not optional, see https://www.w3.org/TR/html/grouping-content.html#the-dl-element + 'dt'=>array('dd'=>1, 'dt'=>1), + 'li'=>array('li'=>1), + 'optgroup'=>array('optgroup'=>1, 'option'=>1), + 'option'=>array('optgroup'=>1, 'option'=>1), + 'p'=>array('p'=>1), + 'rp'=>array('rp'=>1, 'rt'=>1), + 'rt'=>array('rp'=>1, 'rt'=>1), + 'td'=>array('td'=>1, 'th'=>1), + 'th'=>array('td'=>1, 'th'=>1), + 'tr'=>array('td'=>1, 'th'=>1, 'tr'=>1), + ); + + function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0) { if ($str) { @@ -1038,7 +1378,7 @@ class simple_html_dom } else { - $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); + 
$this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options); } } // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. @@ -1054,21 +1394,32 @@ class simple_html_dom } // load html from string - function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) + function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0) { global $debug_object; // prepare - $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); - // strip out cdata - $this->remove_noise("''is", true); - // strip out comments - $this->remove_noise("''is"); + $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); + // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 // Script tags removal now preceeds style tag removal. // strip out