From 5656792cee3ddf4b38a60b6d05b5b08fa705a4c3 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 1 Jun 2019 19:45:28 +0200 Subject: [PATCH] [simplehtmldom] Update to version 1.9 Find the release notes at https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.9/ --- vendor/simplehtmldom/LICENSE | 21 + vendor/simplehtmldom/simple_html_dom.php | 1020 ++++++---------------- 2 files changed, 300 insertions(+), 741 deletions(-) create mode 100644 vendor/simplehtmldom/LICENSE diff --git a/vendor/simplehtmldom/LICENSE b/vendor/simplehtmldom/LICENSE new file mode 100644 index 00000000..6040f77b --- /dev/null +++ b/vendor/simplehtmldom/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 S.C. Chen, John Schlick, logmanoriginal + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/vendor/simplehtmldom/simple_html_dom.php b/vendor/simplehtmldom/simple_html_dom.php index c0001e3b..d30b018e 100644 --- a/vendor/simplehtmldom/simple_html_dom.php +++ b/vendor/simplehtmldom/simple_html_dom.php @@ -3,64 +3,24 @@ * Website: http://sourceforge.net/projects/simplehtmldom/ * Additional projects: http://sourceforge.net/projects/debugobject/ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) - * Contributions by: - * Yousuke Kumakura (Attribute filters) - * Vadim Voituk (Negative indexes supports of "find" method) - * Antcs (Constructor with automatically load contents either text or file/url) - * - * all affected sections have comments starting with "PaperG" - * - * Paperg - Added case insensitive testing of the value of the selector. - * - * Paperg - Added tag_start for the starting index of tags - NOTE: This works - * but not accurately. This tag_start gets counted AFTER \r\n have been crushed - * out, and after the remove_noice calls so it will not reflect the REAL - * position of the tag in the source, it will almost always be smaller by some - * amount. We use this to determine how far into the file the tag in question - * is. This "percentage" will never be accurate as the $dom->size is the "real" - * number of bytes the dom was created from. But for most purposes, it's a - * really good estimation. - * - * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags - * closed is great for malformed html, but it CAN lead to parsing errors. - * - * Allow the user to tell us how much they trust the html. - * - * Paperg add the text and plaintext to the selectors for the find syntax. - * plaintext implies text in the innertext of a node. text implies that the - * tag is a text node. This allows for us to find tags based on the text they - * contain. - * - * Create find_ancestor_tag to see if a tag is - at any level - inside of - * another specific tag. - * - * Paperg: added parse_charset so that we know about the character set of - * the source document. NOTE: If the user's system has a routine called - * get_last_retrieve_url_contents_content_type availalbe, we will assume it's - * returning the content-type header from the last transfer or curl_exec, and - * we will parse that and use it in preference to any other method of charset - * detection. - * - * Found infinite loop in the case of broken html in restore_noise. Rewrote to - * protect from that. - * - * PaperG (John Schlick) Added get_display_size for "IMG" tags. * * Licensed under The MIT License - * Redistributions of files must retain the above copyright notice. + * See the LICENSE file in the project root for more information. * - * @author S.C. Chen - * @author John Schlick - * @author Rus Carroll - * @version Rev. 1.8.1 (247) - * @package PlaceLocalInclude - * @subpackage simple_html_dom + * Authors: + * S.C. Chen + * John Schlick + * Rus Carroll + * logmanoriginal + * + * Contributors: + * Yousuke Kumakura + * Vadim Voituk + * Antcs + * + * Version Rev. 1.9 (290) */ -/** - * All of the Defines for the classes below. - * @author S.C. Chen - */ define('HDOM_TYPE_ELEMENT', 1); define('HDOM_TYPE_COMMENT', 2); define('HDOM_TYPE_TEXT', 3); @@ -79,25 +39,12 @@ define('HDOM_INFO_INNER', 5); define('HDOM_INFO_OUTER', 6); define('HDOM_INFO_ENDSPACE', 7); -/** The default target charset */ defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); - -/** The default
text used instead of
tags when returning text */ defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); - -/** The default text used instead of tags when returning text */ defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); - -/** The maximum file size the parser should load */ defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); - -/** Contents between curly braces "{" and "}" are interpreted as text */ define('HDOM_SMARTY_AS_TEXT', 1); -// helper functions -// ----------------------------------------------------------------------------- -// get html dom from file -// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. function file_get_html( $url, $use_include_path = false, @@ -111,10 +58,8 @@ function file_get_html( $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) { - // Ensure maximum length is greater than zero if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } - // We DO force the tags to be terminated. $dom = new simple_html_dom( null, $lowercase, @@ -122,7 +67,8 @@ function file_get_html( $target_charset, $stripRN, $defaultBRText, - $defaultSpanText); + $defaultSpanText + ); /** * For sourceforge users: uncomment the next line and comment the @@ -133,19 +79,18 @@ function file_get_html( $use_include_path, $context, $offset, - $maxLen); - - // Paperg - use our own mechanism for getting the contents as we want to - // control the timeout. + $maxLen + ); // $contents = retrieve_url_contents($url); - if (empty($contents) || strlen($contents) > $maxLen) { return false; } - // The second parameter can force the selectors to all be lowercase. - $dom->load($contents, $lowercase, $stripRN); - return $dom; + if (empty($contents) || strlen($contents) > $maxLen) { + $dom->clear(); + return false; + } + + return $dom->load($contents, $lowercase, $stripRN); } -// get html dom from string function str_get_html( $str, $lowercase = true, @@ -162,97 +107,34 @@ function str_get_html( $target_charset, $stripRN, $defaultBRText, - $defaultSpanText); + $defaultSpanText + ); if (empty($str) || strlen($str) > MAX_FILE_SIZE) { $dom->clear(); return false; } - $dom->load($str, $lowercase, $stripRN); - return $dom; + return $dom->load($str, $lowercase, $stripRN); } -// dump html dom tree function dump_html_tree($node, $show_attr = true, $deep = 0) { $node->dump($node); } -/** - * simple html dom node - * PaperG - added ability for "find" routine to lowercase the value of the - * selector. - * - * PaperG - added $tag_start to track the start position of the tag in the total - * byte index - * - * @package PlaceLocalInclude - */ class simple_html_dom_node { - /** - * Node type - * - * Default is {@see HDOM_TYPE_TEXT} - * - * @var int - */ public $nodetype = HDOM_TYPE_TEXT; - - /** - * Tag name - * - * Default is 'text' - * - * @var string - */ public $tag = 'text'; - - /** - * List of attributes - * - * @var array - */ public $attr = array(); - - /** - * List of child node objects - * - * @var array - */ public $children = array(); public $nodes = array(); - - /** - * The parent node object - * - * @var object|null - */ public $parent = null; - - // The "info" array - see HDOM_INFO_... for what each element contains. public $_ = array(); - - /** - * Start position of the tag in the document - * - * @var int - */ public $tag_start = 0; - - /** - * The DOM object - * - * @var object|null - */ private $dom = null; - /** - * Construct new node object - * - * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes} - */ function __construct($dom) { $this->dom = $dom; @@ -269,7 +151,6 @@ class simple_html_dom_node return $this->outertext(); } - // clean up memory due to php5 circular references memory leak... function clear() { $this->dom = null; @@ -278,17 +159,14 @@ class simple_html_dom_node $this->children = null; } - // dump node's tree - function dump($show_attr = true, $deep = 0) + function dump($show_attr = true, $depth = 0) { - $lead = str_repeat(' ', $deep); - - echo $lead . $this->tag; + echo str_repeat("\t", $depth) . $this->tag; if ($show_attr && count($this->attr) > 0) { echo '('; foreach ($this->attr as $k => $v) { - echo "[$k]=>\"" . $this->$k . '", '; + echo "[$k]=>\"$v\", "; } echo ')'; } @@ -296,14 +174,12 @@ class simple_html_dom_node echo "\n"; if ($this->nodes) { - foreach ($this->nodes as $c) { - $c->dump($show_attr, $deep + 1); + foreach ($this->nodes as $node) { + $node->dump($show_attr, $depth + 1); } } } - - // Debugging function to dump a single dom node with a bunch of information about it. function dump_node($echo = true) { $string = $this->tag; @@ -311,7 +187,7 @@ class simple_html_dom_node if (count($this->attr) > 0) { $string .= '('; foreach ($this->attr as $k => $v) { - $string .= "[$k]=>\"" . $this->$k . '", '; + $string .= "[$k]=>\"$v\", "; } $string .= ')'; } @@ -322,24 +198,24 @@ class simple_html_dom_node if (is_array($v)) { $string .= "[$k]=>("; foreach ($v as $k2 => $v2) { - $string .= "[$k2]=>\"" . $v2 . '", '; + $string .= "[$k2]=>\"$v2\", "; } $string .= ')'; } else { - $string .= "[$k]=>\"" . $v . '", '; + $string .= "[$k]=>\"$v\", "; } } $string .= ')'; } if (isset($this->text)) { - $string .= ' text: (' . $this->text . ')'; + $string .= " text: ({$this->text})"; } - $string .= " HDOM_INNER_INFO: '"; + $string .= ' HDOM_INNER_INFO: '; if (isset($node->_[HDOM_INFO_INNER])) { - $string .= $node->_[HDOM_INFO_INNER] . "'"; + $string .= "'" . $node->_[HDOM_INFO_INNER] . "'"; } else { $string .= ' NULL '; } @@ -357,13 +233,6 @@ class simple_html_dom_node } } - /** - * Return or set parent node - * - * @param object|null $parent (optional) The parent node, `null` to return - * the current parent node. - * @return object|null The parent node - */ function parent($parent = null) { // I am SURE that this doesn't work properly. @@ -378,22 +247,11 @@ class simple_html_dom_node return $this->parent; } - /** - * @return bool True if the node has at least one child node - */ function has_child() { return !empty($this->children); } - /** - * Get child node at specified index - * - * @param int $idx The index of the child node to return, `-1` to return all - * child nodes. - * @return object|array|null The child node at the specified index, all child - * nodes or null if the index is invalid. - */ function children($idx = -1) { if ($idx === -1) { @@ -407,15 +265,6 @@ class simple_html_dom_node return null; } - /** - * Get first child node - * - * @return object|null The first child node or null if the current node has - * no child nodes. - * - * @todo Use `empty()` instead of `count()` to improve performance on large - * arrays. - */ function first_child() { if (count($this->children) > 0) { @@ -424,108 +273,70 @@ class simple_html_dom_node return null; } - /** - * Get last child node - * - * @return object|null The last child node or null if the current node has - * no child nodes. - * - * @todo Use `end()` to slightly improve performance on large arrays. - */ function last_child() { - if (($count = count($this->children)) > 0) { - return $this->children[$count - 1]; + if (count($this->children) > 0) { + return end($this->children); } return null; } - /** - * Get next sibling node - * - * @return object|null The sibling node or null if the current node has no - * sibling nodes. - */ function next_sibling() { if ($this->parent === null) { return null; } - $idx = 0; - $count = count($this->parent->children); + $idx = array_search($this, $this->parent->children, true); - while ($idx < $count && $this !== $this->parent->children[$idx]) { - ++$idx; + if ($idx !== false && isset($this->parent->children[$idx + 1])) { + return $this->parent->children[$idx + 1]; } - if (++$idx >= $count) { + return null; + } + + function prev_sibling() + { + if ($this->parent === null) { return null; } - return $this->parent->children[$idx]; - } + $idx = array_search($this, $this->parent->children, true); - /** - * Get previous sibling node - * - * @return object|null The sibling node or null if the current node has no - * sibling nodes. - */ - function prev_sibling() - { - if ($this->parent === null) { return null; } - - $idx = 0; - $count = count($this->parent->children); - - while ($idx < $count && $this !== $this->parent->children[$idx]) { - ++$idx; + if ($idx !== false && $idx > 0) { + return $this->parent->children[$idx - 1]; } - if (--$idx < 0) { return null; } - - return $this->parent->children[$idx]; + return null; } - /** - * Traverse ancestors to the first matching tag. - * - * @param string $tag Tag to find - * @return object|null First matching node in the DOM tree or null if no - * match was found. - * - * @todo Null is returned implicitly by calling ->parent on the root node. - * This behaviour could change at any time, rendering this function invalid. - */ function find_ancestor_tag($tag) { global $debug_object; if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } - // Start by including ourselves in the comparison. - $returnDom = $this; + if ($this->parent === null) { + return null; + } - while (!is_null($returnDom)) { + $ancestor = $this->parent; + + while (!is_null($ancestor)) { if (is_object($debug_object)) { - $debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag); + $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag); } - if ($returnDom->tag == $tag) { + if ($ancestor->tag === $tag) { break; } - $returnDom = $returnDom->parent; + $ancestor = $ancestor->parent; } - return $returnDom; + return $ancestor; } - /** - * Get node's inner text (everything inside the opening and closing tags) - * - * @return string - */ function innertext() { if (isset($this->_[HDOM_INFO_INNER])) { @@ -545,11 +356,6 @@ class simple_html_dom_node return $ret; } - /** - * Get node's outer text (everything including the opening and closing tags) - * - * @return string - */ function outertext() { global $debug_object; @@ -566,9 +372,11 @@ class simple_html_dom_node $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); } - if ($this->tag === 'root') return $this->innertext(); + if ($this->tag === 'root') { + return $this->innertext(); + } - // trigger callback + // todo: What is the use of this callback? Remove? if ($this->dom && $this->dom->callback !== null) { call_user_func_array($this->dom->callback, array($this)); } @@ -581,29 +389,23 @@ class simple_html_dom_node return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); } - // render begin tag + $ret = ''; + if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); - } else { - $ret = ''; } - // render inner text if (isset($this->_[HDOM_INFO_INNER])) { - // If it's a br tag... don't return the HDOM_INNER_INFO that we - // may or may not have added. + // todo:
should either never have HDOM_INFO_INNER or always if ($this->tag !== 'br') { $ret .= $this->_[HDOM_INFO_INNER]; } - } else { - if ($this->nodes) { - foreach ($this->nodes as $n) { - $ret .= $this->convert_text($n->outertext()); - } + } elseif ($this->nodes) { + foreach ($this->nodes as $n) { + $ret .= $this->convert_text($n->outertext()); } } - // render end tag if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { $ret .= 'tag . '>'; } @@ -611,11 +413,6 @@ class simple_html_dom_node return $ret; } - /** - * Get node's plain text (everything excluding all tags) - * - * @return string - */ function text() { if (isset($this->_[HDOM_INFO_INNER])) { @@ -642,7 +439,7 @@ class simple_html_dom_node foreach ($this->nodes as $n) { // Start paragraph after a blank line if ($n->tag === 'p') { - $ret .= "\n\n"; + $ret = trim($ret) . "\n\n"; } $ret .= $this->convert_text($n->text()); @@ -655,14 +452,9 @@ class simple_html_dom_node } } } - return trim($ret); + return $ret; } - /** - * Get node's xml text (inner text as a CDATA section) - * - * @return string - */ function xmltext() { $ret = $this->innertext(); @@ -671,7 +463,6 @@ class simple_html_dom_node return $ret; } - // build node's text with tag function makeup() { // text, comment, unknown @@ -715,18 +506,6 @@ class simple_html_dom_node return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; } - /** - * Find elements by CSS selector - * - * @param string $selector The CSS selector - * @param int|null $idx Index of element to return form the list of matching - * elements (default: `null` = disabled). - * @param bool $lowercase Matches tag names case insensitive (lowercase) if - * enabled (default: `false`) - * @return array|object|null A list of elements matching the specified CSS - * selector or a single element if $idx is specified or null if no element - * was found. - */ function find($selector, $idx = null, $lowercase = false) { $selectors = $this->parse_selector($selector); @@ -779,19 +558,6 @@ class simple_html_dom_node return (isset($found[$idx])) ? $found[$idx] : null; } - /** - * Seek DOM elements by selector - * - * **Note** - * The selector element must be compatible to a selector from - * {@see simple_html_dom_node::parse_selector()} - * - * @param array $selector A selector element - * @param array $ret An array of matches - * @param bool $lowercase Matches tag names case insensitive (lowercase) if - * enabled (default: `false`) - * @return void - */ protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) { global $debug_object; @@ -823,7 +589,8 @@ class simple_html_dom_node && $this->parent && in_array($this, $this->parent->children)) { // Next-Sibling Combinator $index = array_search($this, $this->parent->children, true) + 1; - $nodes[] = $this->parent->children[$index]; + if ($index < count($this->parent->children)) + $nodes[] = $this->parent->children[$index]; } elseif ($parent_cmd === '~' && $this->parent && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator @@ -1006,24 +773,6 @@ class simple_html_dom_node } } - /** - * Match value and pattern for a given CSS expression - * - * **Supported Expressions** - * - * | Expression | Description - * | ---------- | ----------- - * | `=` | $value and $pattern must be equal - * | `!=` | $value and $pattern must not be equal - * | `^=` | $value must start with $pattern - * | `$=` | $value must end with $pattern - * | `*=` | $value must contain $pattern - * - * @param string $exp The expression. - * @param string $pattern The pattern - * @param string $value The value - * @value bool True if $value matches $pattern - */ protected function match($exp, $pattern, $value, $case_sensitivity) { global $debug_object; @@ -1069,31 +818,6 @@ class simple_html_dom_node return false; } - /** - * Parse CSS selector - * - * @param string $selector_string CSS selector string - * @return array List of CSS selectors. The format depends on the type of - * selector: - * - * ```php - * - * array( // list of selectors (each separated by a comma), i.e. 'img, p, div' - * array( // list of combinator selectors, i.e. 'img > p > div' - * array( // selector element - * [0], // (string) The element tag - * [1], // (string) The element id - * [2], // (array) The element classes - * [3], // (array>) The list of attributes, each - * // with four elements: name, expression, value, inverted - * [4] // (string) The selector combinator (' ' | '>' | '+' | '~') - * ) - * ) - * ) - * ``` - * - * @link https://www.w3.org/TR/selectors/#compound Compound selector - */ protected function parse_selector($selector_string) { global $debug_object; @@ -1185,7 +909,7 @@ class simple_html_dom_node */ if($m[4] !== '') { preg_match_all( - "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is", + "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", trim($m[4]), $attributes, PREG_SET_ORDER @@ -1285,8 +1009,6 @@ class simple_html_dom_node if (isset($this->attr[$name])) { unset($this->attr[$name]); } } - // PaperG - Function to convert the text from one character set to another - // if the two sets are not the same. function convert_text($text) { global $debug_object; @@ -1337,12 +1059,6 @@ class simple_html_dom_node return $converted_text; } - /** - * Returns true if $string is valid UTF-8 and false otherwise. - * - * @param mixed $str String to be tested - * @return boolean - */ static function is_utf8($str) { $c = 0; $b = 0; @@ -1370,16 +1086,6 @@ class simple_html_dom_node return true; } - /** - * Function to try a few tricks to determine the displayed size of an img on - * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all - * other tag types. - * - * @author John Schlick - * @version April 19 2012 - * @return array an array containing the 'height' and 'width' of the image - * on the page or -1 if we can't figure it out. - */ function get_display_size() { global $debug_object; @@ -1465,7 +1171,82 @@ class simple_html_dom_node return $result; } - // camel naming conventions + function save($filepath = '') + { + $ret = $this->outertext(); + + if ($filepath !== '') { + file_put_contents($filepath, $ret, LOCK_EX); + } + + return $ret; + } + + function addClass($class) + { + if (is_string($class)) { + $class = explode(' ', $class); + } + + if (is_array($class)) { + foreach($class as $c) { + if (isset($this->class)) { + if ($this->hasClass($c)) { + continue; + } else { + $this->class .= ' ' . $c; + } + } else { + $this->class = $c; + } + } + } else { + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); + } + } + } + + function hasClass($class) + { + if (is_string($class)) { + if (isset($this->class)) { + return in_array($class, explode(' ', $this->class), true); + } + } else { + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); + } + } + + return false; + } + + function removeClass($class = null) + { + if (!isset($this->class)) { + return; + } + + if (is_null($class)) { + $this->removeAttribute('class'); + return; + } + + if (is_string($class)) { + $class = explode(' ', $class); + } + + if (is_array($class)) { + $class = array_diff(explode(' ', $this->class), $class); + if (empty($class)) { + $this->removeAttribute('class'); + } else { + $this->class = implode(' ', $class); + } + } + } + function getAllAttributes() { return $this->attr; @@ -1491,6 +1272,44 @@ class simple_html_dom_node $this->__set($name, null); } + function remove() + { + if ($this->parent) { + $this->parent->removeChild($this); + } + } + + function removeChild($node) + { + $nidx = array_search($node, $this->nodes, true); + $cidx = array_search($node, $this->children, true); + $didx = array_search($node, $this->dom->nodes, true); + + if ($nidx !== false && $cidx !== false && $didx !== false) { + + foreach($node->children as $child) { + $node->removeChild($child); + } + + foreach($node->nodes as $entity) { + $enidx = array_search($entity, $node->nodes, true); + $edidx = array_search($entity, $node->dom->nodes, true); + + if ($enidx !== false && $edidx !== false) { + unset($node->nodes[$enidx]); + unset($node->dom->nodes[$edidx]); + } + } + + unset($this->nodes[$nidx]); + unset($this->children[$cidx]); + unset($this->dom->nodes[$didx]); + + $node->clear(); + + } + } + function getElementById($id) { return $this->find("#$id", 0); @@ -1559,170 +1378,34 @@ class simple_html_dom_node } -/** - * simple html dom parser - * - * Paperg - in the find routine: allow us to specify that we want case - * insensitive testing of the value of the selector. - * - * Paperg - change $size from protected to public so we can easily access it - * - * Paperg - added ForceTagsClosed in the constructor which tells us whether we - * trust the html or not. Default is to NOT trust it. - * - * @package PlaceLocalInclude - */ class simple_html_dom { - /** - * The root node of the document - * - * @var object - */ public $root = null; - - /** - * List of nodes in the current DOM - * - * @var array - */ public $nodes = array(); - - /** - * Callback function to run for each element in the DOM. - * - * @var callable|null - */ public $callback = null; - - /** - * Indicates how tags and attributes are matched - * - * @var bool When set to **true** tags and attributes will be converted to - * lowercase before matching. - */ public $lowercase = false; - - /** - * Original document size - * - * Holds the original document size. - * - * @var int - */ public $original_size; - - /** - * Current document size - * - * Holds the current document size. The document size is determined by the - * string length of ({@see simple_html_dom::$doc}). - * - * _Note_: Using this variable is more efficient than calling `strlen($doc)` - * - * @var int - * */ public $size; - /** - * Current position in the document - * - * @var int - */ protected $pos; - - /** - * The document - * - * @var string - */ protected $doc; - - /** - * Current character - * - * Holds the current character at position {@see simple_html_dom::$pos} in - * the document {@see simple_html_dom::$doc} - * - * _Note_: Using this variable is more efficient than calling - * `substr($doc, $pos, 1)` - * - * @var string - */ protected $char; protected $cursor; - - /** - * Parent node of the next node detected by the parser - * - * @var object - */ protected $parent; protected $noise = array(); - - /** - * Tokens considered blank in HTML - * - * @var string - */ protected $token_blank = " \t\r\n"; - - /** - * Tokens to identify the equal sign for attributes, stopping either at the - * closing tag ("/" i.e. "") or the end of an opening tag (">" i.e. - * "") - * - * @var string - */ protected $token_equal = ' =/>'; - - /** - * Tokens to identify the end of a tag name. A tag name either ends on the - * ending slash ("/" i.e. "") or whitespace ("\s\r\n\t") - * - * @var string - */ protected $token_slash = " />\r\n\t"; - - /** - * Tokens to identify the end of an attribute - * - * @var string - */ protected $token_attr = ' >'; - // Note that this is referenced by a child node, and so it needs to be - // public for that node to see this information. public $_charset = ''; public $_target_charset = ''; - /** - * Innertext for
elements - * - * @var string - */ protected $default_br_text = ''; - /** - * Suffix for elements - * - * @var string - */ public $default_span_text = ''; - /** - * Defines a list of self-closing tags (Void elements) according to the HTML - * Specification - * - * _Remarks_: - * - Use `isset()` instead of `in_array()` on array elements to boost - * performance about 30% - * - Sort elements by name for better readability! - * - * @link https://www.w3.org/TR/html HTML Specification - * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements - */ protected $self_closing_tags = array( 'area' => 1, 'base' => 1, @@ -1739,18 +1422,6 @@ class simple_html_dom 'track' => 1, 'wbr' => 1 ); - - /** - * Defines a list of tags which - if closed - close all optional closing - * elements within if they haven't been closed yet. (So, an element where - * neither opening nor closing tag is omissible consistently closes every - * optional closing element within) - * - * _Remarks_: - * - Use `isset()` instead of `in_array()` on array elements to boost - * performance about 30% - * - Sort elements by name for better readability! - */ protected $block_tags = array( 'body' => 1, 'div' => 1, @@ -1759,62 +1430,6 @@ class simple_html_dom 'span' => 1, 'table' => 1 ); - - /** - * Defines elements whose end tag is omissible. - * - * * key = Name of an element whose end tag is omissible. - * * value = Names of elements whose end tag is omissible, that are closed - * by the current element. - * - * _Remarks_: - * - Use `isset()` instead of `in_array()` on array elements to boost - * performance about 30% - * - Sort elements by name for better readability! - * - * **Example** - * - * An `li` element’s end tag may be omitted if the `li` element is immediately - * followed by another `li` element. To do that, add following element to the - * array: - * - * ```php - * 'li' => array('li'), - * ``` - * - * With this, the following two examples are considered equal. Note that the - * second example is missing the closing tags on `li` elements. - * - * ```html - *
  • First Item
  • Second Item
- * ``` - * - *
  • First Item
  • Second Item
- * - * ```html - *
  • First Item
  • Second Item
- * ``` - * - *
  • First Item
  • Second Item
- * - * @var array A two-dimensional array where the key is the name of an - * element whose end tag is omissible and the value is an array of elements - * whose end tag is omissible, that are closed by the current element. - * - * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags - * - * @todo The implementation of optional closing tags doesn't work in all cases - * because it only consideres elements who close other optional closing - * tags, not taking into account that some (non-blocking) tags should close - * these optional closing tags. For example, the end tag for "p" is omissible - * and can be closed by an "address" element, whose end tag is NOT omissible. - * Currently a "p" element without closing tag stops at the next "p" element - * or blocking tag, even if it contains other elements. - * - * @todo Known sourceforge issue #2977341 - * B tags that are not closed cause us to return everything to the end of - * the document. - */ protected $optional_closing_tags = array( // Not optional, see // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element @@ -1873,7 +1488,6 @@ class simple_html_dom $this->clear(); } - // load html from string function load( $str, $lowercase = true, @@ -1928,7 +1542,6 @@ class simple_html_dom return $this; } - // load html from file function load_file() { $args = func_get_args(); @@ -1940,29 +1553,16 @@ class simple_html_dom } } - /** - * Set the callback function - * - * @param callable $function_name Callback function to run for each element - * in the DOM. - * @return void - */ function set_callback($function_name) { $this->callback = $function_name; } - /** - * Remove callback function - * - * @return void - */ function remove_callback() { $this->callback = null; } - // save dom as string function save($filepath = '') { $ret = $this->root->innertext(); @@ -1970,18 +1570,18 @@ class simple_html_dom return $ret; } - // find dom node by css selector - // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. function find($selector, $idx = null, $lowercase = false) { return $this->root->find($selector, $idx, $lowercase); } - // clean up memory due to php5 circular references memory leak... function clear() { - foreach ($this->nodes as $n) { - $n->clear(); $n = null; + if (isset($this->nodes)) { + foreach ($this->nodes as $n) { + $n->clear(); + $n = null; + } } // This add next line is documented in the sourceforge repository. @@ -1989,7 +1589,8 @@ class simple_html_dom // use of clear. if (isset($this->children)) { foreach ($this->children as $n) { - $n->clear(); $n = null; + $n->clear(); + $n = null; } } @@ -2012,7 +1613,6 @@ class simple_html_dom $this->root->dump($show_attr); } - // prepare HTML data and init everything protected function prepare( $str, $lowercase = true, $defaultBRText = DEFAULT_BR_TEXT, @@ -2038,11 +1638,6 @@ class simple_html_dom if ($this->size > 0) { $this->char = $this->doc[0]; } } - /** - * Parse HTML content - * - * @return bool True on success - */ protected function parse() { while (true) { @@ -2064,13 +1659,6 @@ class simple_html_dom } } - // PAPERG - dkchou - added this to try to identify the character set of the - // page we have just parsed so we know better how to spit it out later. - // NOTE: IF you provide a routine called - // get_last_retrieve_url_contents_content_type which returns the - // CURLINFO_CONTENT_TYPE from the last curl_exec - // (or the content_type header from the last transfer), we will parse THAT, - // and if a charset is specified, we will use it over any other mechanism. protected function parse_charset() { global $debug_object; @@ -2092,6 +1680,7 @@ class simple_html_dom } if (empty($charset)) { + // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); if (!empty($el)) { @@ -2128,53 +1717,77 @@ class simple_html_dom } } - // If we couldn't find a charset above, then lets try to detect one - // based on the text we got... if (empty($charset)) { - // Use this in case mb_detect_charset isn't installed/loaded on - // this machine. - $charset = false; - if (function_exists('mb_detect_encoding')) { - // Have php try to detect the encoding from the text given to us. - $charset = mb_detect_encoding( - $this->doc . 'ascii', - $encoding_list = array( 'UTF-8', 'CP1252' ) - ); - + // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration + if ($meta = $this->root->find('meta[charset]', 0)) { + $charset = $meta->charset; if (is_object($debug_object)) { - $debug_object->debug_log(2, 'mb_detect found: ' . $charset); + $debug_object->debug_log(2, 'meta charset: ' . $charset); } } + } - // and if this doesn't work... then we need to just wrongheadedly - // assume it's UTF-8 so that we can move on - cause this will - // usually give us most of what we need... - if ($charset === false) { - if (is_object($debug_object)) { - $debug_object->debug_log( - 2, - 'since mb_detect failed - using default of utf-8' - ); + if (empty($charset)) { + // Try to guess the charset based on the content + // Requires Multibyte String (mbstring) support (optional) + if (function_exists('mb_detect_encoding')) { + /** + * mb_detect_encoding() is not intended to distinguish between + * charsets, especially single-byte charsets. Its primary + * purpose is to detect which multibyte encoding is in use, + * i.e. UTF-8, UTF-16, shift-JIS, etc. + * + * -- https://bugs.php.net/bug.php?id=38138 + * + * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will + * always result in CP1251/ISO-8859-5 and vice versa. + * + * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 + * to stay compatible. + */ + $encoding = mb_detect_encoding( + $this->doc, + array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) + ); + + if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { + // Due to a limitation of mb_detect_encoding + // 'CP1251'/'ISO-8859-5' will be detected as + // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in + // which case we can simply assume it is the other charset. + if (!@iconv('CP1252', 'UTF-8', $this->doc)) { + $encoding = 'CP1251'; + } } - $charset = 'UTF-8'; + if ($encoding !== false) { + $charset = $encoding; + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'mb_detect: ' . $charset); + } + } + } + } + + if (empty($charset)) { + // Assume it's UTF-8 as it is the most likely charset to be used + $charset = 'UTF-8'; + if (is_object($debug_object)) { + $debug_object->debug_log(2, 'No match found, assume ' . $charset); } } // Since CP1252 is a superset, if we get one of it's subsets, we want // it instead. - if ((strtolower($charset) == strtolower('ISO-8859-1')) - || (strtolower($charset) == strtolower('Latin1')) - || (strtolower($charset) == strtolower('Latin-1'))) { - + if ((strtolower($charset) == 'iso-8859-1') + || (strtolower($charset) == 'latin1') + || (strtolower($charset) == 'latin-1')) { + $charset = 'CP1252'; if (is_object($debug_object)) { - $debug_object->debug_log( - 2, + $debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset' ); } - - $charset = 'CP1252'; } if (is_object($debug_object)) { @@ -2184,11 +1797,6 @@ class simple_html_dom return $this->_charset = $charset; } - /** - * Parse tag from current document position. - * - * @return bool True if a tag was found, false otherwise - */ protected function read_tag() { // Set end position if no further tags found @@ -2467,63 +2075,50 @@ class simple_html_dom return true; } - /** - * Parse attribute from current document position - * - * @param object $node Node for the attributes - * @param string $name Name of the current attribute - * @param array $space Array for spacing information - * @return void - */ protected function parse_attr($node, $name, &$space) { - // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037 - // If the attribute is already defined inside a tag, only pay attention - // to the first one as opposed to the last one. - // https://stackoverflow.com/a/26341866 - if (isset($node->attr[$name])) { - return; - } + $is_duplicate = isset($node->attr[$name]); - // [2] Whitespace between "=" and the value - $space[2] = $this->copy_skip($this->token_blank); + if (!$is_duplicate) // Copy whitespace between "=" and value + $space[2] = $this->copy_skip($this->token_blank); switch ($this->char) { - case '"': // value is anything between double quotes - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; + case '"': + $quote_type = HDOM_QUOTE_DOUBLE; $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next - $node->attr[$name] = $this->restore_noise($this->copy_until_char('"')); + $value = $this->copy_until_char('"'); $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next break; - case '\'': // value is anything between single quotes - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; + case '\'': + $quote_type = HDOM_QUOTE_SINGLE; $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next - $node->attr[$name] = $this->restore_noise($this->copy_until_char('\'')); + $value = $this->copy_until_char('\''); $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next break; - default: // value is anything until the first space or end tag - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; - $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); + default: + $quote_type = HDOM_QUOTE_NO; + $value = $this->copy_until($this->token_attr); } + + $value = $this->restore_noise($value); + // PaperG: Attributes should not have \r or \n in them, that counts as // html whitespace. - $node->attr[$name] = str_replace("\r", '', $node->attr[$name]); - $node->attr[$name] = str_replace("\n", '', $node->attr[$name]); + $value = str_replace("\r", '', $value); + $value = str_replace("\n", '', $value); + // PaperG: If this is a "class" selector, lets get rid of the preceeding // and trailing space since some people leave it in the multi class case. if ($name === 'class') { - $node->attr[$name] = trim($node->attr[$name]); + $value = trim($value); + } + + if (!$is_duplicate) { + $node->_[HDOM_INFO_QUOTE][] = $quote_type; + $node->attr[$name] = $value; } } - /** - * Link node to parent node - * - * @param object $node Node to link to parent - * @param bool $is_child True if the node is a child of parent - * @return void - */ - // link node's parent protected function link_nodes(&$node, $is_child) { $node->parent = $this->parent; @@ -2533,12 +2128,6 @@ class simple_html_dom } } - /** - * Add tag as text node to current node - * - * @param string $tag Tag name - * @return bool True on success - */ protected function as_text_node($tag) { $node = new simple_html_dom_node($this); @@ -2549,28 +2138,12 @@ class simple_html_dom return true; } - /** - * Seek from the current document position to the first occurrence of a - * character not defined by the provided string. Update the current document - * position to the new position. - * - * @param string $chars A string containing every allowed character. - * @return void - */ protected function skip($chars) { $this->pos += strspn($this->doc, $chars, $this->pos); $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next } - /** - * Copy substring from the current document position to the first occurrence - * of a character not defined by the provided string. - * - * @param string $chars A string containing every allowed character. - * @return string Substring from the current document position to the first - * occurrence of a character not defined by the provided string. - */ protected function copy_skip($chars) { $pos = $this->pos; @@ -2581,14 +2154,6 @@ class simple_html_dom return substr($this->doc, $pos, $len); } - /** - * Copy substring from the current document position to the first occurrence - * of any of the provided characters. - * - * @param string $chars A string containing every character to stop at. - * @return string Substring from the current document position to the first - * occurrence of any of the provided characters. - */ protected function copy_until($chars) { $pos = $this->pos; @@ -2598,14 +2163,6 @@ class simple_html_dom return substr($this->doc, $pos, $len); } - /** - * Copy substring from the current document position to the first occurrence - * of the provided string. - * - * @param string $char The string to stop at. - * @return string Substring from the current document position to the first - * occurrence of the provided string. - */ protected function copy_until_char($char) { if ($this->char === null) { return ''; } @@ -2625,15 +2182,6 @@ class simple_html_dom return substr($this->doc, $pos_old, $pos - $pos_old); } - /** - * Remove noise from HTML content - * - * Noise is stored to {@see simple_html_dom::$noise} - * - * @param string $pattern The regex pattern used for finding noise - * @param bool $remove_tag True to remove the entire match. Default is false - * to only remove the captured data. - */ protected function remove_noise($pattern, $remove_tag = false) { global $debug_object; @@ -2666,14 +2214,6 @@ class simple_html_dom } } - /** - * Restore noise to HTML content - * - * Noise is restored from {@see simple_html_dom::$noise} - * - * @param string $text A subset of HTML containing noise - * @return string The same content with noise restored - */ function restore_noise($text) { global $debug_object; @@ -2720,7 +2260,6 @@ class simple_html_dom return $text; } - // Sometimes we NEED one of the noise elements. function search_noise($text) { global $debug_object; @@ -2754,7 +2293,6 @@ class simple_html_dom } } - // camel naming conventions function childNodes($idx = -1) { return $this->root->childNodes($idx); @@ -2772,7 +2310,7 @@ class simple_html_dom function createElement($name, $value = null) { - return @str_get_html("<$name>$value")->first_child(); + return @str_get_html("<$name>$value")->firstChild(); } function createTextNode($value)