size is the "real" number of bytes the dom was created from.
 * but for most purposes, it's a really good estimation.
 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
 * Allow the user to tell us how much they trust the html.
 * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
 * This allows for us to find tags based on the text they contain.
 * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
 * Paperg: added parse_charset so that we know about the character set of the source document.
 * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
 * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
 *
 * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
 *
 * Licensed under The MIT License
 * Redistributions of files must retain the above copyright notice.
 *
 * @author S.C. Chen
 * @author John Schlick
 * @author Rus Carroll
 * @version Rev. 1.7 (214)
 * @package PlaceLocalInclude
 * @subpackage simple_html_dom
 */

/**
 * All of the Defines for the classes below.
 * @author S.C.
Chen */

// Node types
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quote styles
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node "info" array (simple_html_dom_node::$_)
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);

// Defaults
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
define('MAX_FILE_SIZE', 600000);

/** Contents between curly braces "{" and "}" are interpreted as text */
define('HDOM_SMARTY_AS_TEXT', 1);

// helper functions
// -----------------------------------------------------------------------------

/**
 * Get html dom from file.
 *
 * $maxLen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1;
 * any value <= 0 is replaced by MAX_FILE_SIZE.
 *
 * @param string $url Passed to file_get_contents().
 * @param bool $use_include_path Passed to file_get_contents().
 * @param resource|null $context Passed to file_get_contents().
 * @param int $offset Passed to file_get_contents().
 * @param int $maxLen Maximum number of bytes to read / accept.
 * @param bool $lowercase Passed to the DOM constructor and to load().
 * @param bool $forceTagsClosed Passed to the DOM constructor.
 * @param string $target_charset Passed to the DOM constructor.
 * @param bool $stripRN Passed to the DOM constructor and to load().
 * @param string $defaultBRText Passed to the DOM constructor and to load().
 * @param string $defaultSpanText Passed to the DOM constructor and to load().
 * @return simple_html_dom|false The loaded DOM, or false if the contents are
 * empty or longer than $maxLen.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Ensure maximum length is greater than zero
    if ($maxLen <= 0) {
        $maxLen = MAX_FILE_SIZE;
    }

    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
    $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
    //$contents = retrieve_url_contents($url);

    if (empty($contents) || strlen($contents) > $maxLen) {
        // Release the DOM the same way str_get_html() does on failure.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    // The constructor ignores the BR/span text when it is given no document,
    // so they have to be handed to load() explicitly to take effect.
    $dom->load($contents, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    return $dom;
}

/**
 * Get html dom from string.
 *
 * @param string $str The HTML source.
 * @param bool $lowercase Passed to the DOM constructor and to load().
 * @param bool $forceTagsClosed Passed to the DOM constructor.
 * @param string $target_charset Passed to the DOM constructor.
 * @param bool $stripRN Passed to the DOM constructor and to load().
 * @param string $defaultBRText Passed to the DOM constructor and to load().
 * @param string $defaultSpanText Passed to the DOM constructor and to load().
 * @return simple_html_dom|false The loaded DOM, or false if $str is empty or
 * longer than MAX_FILE_SIZE.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
        $dom->clear();
        return false;
    }

    // See file_get_html(): the BR/span text only takes effect through load().
    $dom->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    return $dom;
}

/**
 * Dump html dom tree.
 *
 * @param simple_html_dom_node $node Node whose tree is echoed.
 * @param bool $show_attr Whether attributes are printed.
 * @param int $deep Starting nesting depth (controls the lead-in).
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // dump() takes ($show_attr, $deep); forward the caller's arguments.
    $node->dump($show_attr, $deep);
}

/**
 * simple html dom node
 * PaperG - added ability for "find" routine to lowercase the value of the selector.
 * PaperG - added $tag_start to track the start position of the tag in the total byte index
 *
 * @package PlaceLocalInclude
 */
class simple_html_dom_node
{
    /**
     * Node type
     *
     * Default is {@see HDOM_TYPE_TEXT}
     *
     * @var int
     */
    public $nodetype = HDOM_TYPE_TEXT;

    /**
     * Tag name
     *
     * Default is 'text'
     *
     * @var string
     */
    public $tag = 'text';

    /**
     * List of attributes
     *
     * @var array
     */
    public $attr = array();

    /**
     * List of child node objects
     *
     * @var array
     */
    public $children = array();
    public $nodes = array();

    /**
     * The parent node object
     *
     * @var object|null
     */
    public $parent = null;

    // The "info" array - see HDOM_INFO_... for what each element contains.
public $_ = array();

    /**
     * Start position of the tag in the document
     *
     * @var int
     */
    public $tag_start = 0;

    /**
     * The DOM object
     *
     * @var object|null
     */
    private $dom = null;

    /**
     * Construct new node object
     *
     * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes}
     */
    function __construct($dom)
    {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct()
    {
        $this->clear();
    }

    function __toString()
    {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    function clear()
    {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    /**
     * Dump node's tree: echoes this node's tag (and optionally its attributes)
     * and then recurses into every entry of $this->nodes.
     *
     * @param bool $show_attr Whether attributes are printed.
     * @param int $deep Nesting depth, used to build the lead-in.
     */
    function dump($show_attr=true, $deep=0)
    {
        $lead = str_repeat(' ', $deep);

        echo $lead.$this->tag;

        if ($show_attr && count($this->attr)>0) {
            echo '(';
            foreach ($this->attr as $k=>$v) {
                echo "[$k]=>\"".$this->$k.'", ';
            }
            echo ')';
        }

        echo "\n";

        if ($this->nodes) {
            foreach ($this->nodes as $c) {
                $c->dump($show_attr, $deep+1);
            }
        }
    }

    /**
     * Debugging function to dump a single dom node with a bunch of information about it.
     *
     * @param bool $echo When true the description is echoed and nothing is
     * returned, otherwise it is returned as a string.
     * @return string|void
     */
    function dump_node($echo=true)
    {
        $string = $this->tag;

        if (count($this->attr)>0) {
            $string .= '(';
            foreach ($this->attr as $k=>$v) {
                $string .= "[$k]=>\"".$this->$k.'", ';
            }
            $string .= ')';
        }

        if (count($this->_)>0) {
            $string .= ' $_ (';
            foreach ($this->_ as $k=>$v) {
                if (is_array($v)) {
                    $string .= "[$k]=>(";
                    foreach ($v as $k2=>$v2) {
                        $string .= "[$k2]=>\"".$v2.'", ';
                    }
                    $string .= ")";
                } else {
                    $string .= "[$k]=>\"".$v.'", ';
                }
            }
            $string .= ")";
        }

        if (isset($this->text)) {
            $string .= " text: (" . $this->text . ")";
        }

        // The info array belongs to this node; there is no local $node here.
        $string .= " HDOM_INNER_INFO: '";
        if (isset($this->_[HDOM_INFO_INNER])) {
            $string .= $this->_[HDOM_INFO_INNER] . "'";
        } else {
            $string .= ' NULL ';
        }

        $string .= " children: " . count($this->children);
        $string .= " nodes: " . count($this->nodes);
        $string .= " tag_start: " . $this->tag_start;
        $string .= "\n";

        if ($echo) {
            echo $string;
            return;
        } else {
            return $string;
        }
    }

    /**
     * Return or set parent node
     *
     * @param object|null $parent (optional) The parent node, `null` to return
     * the current parent node.
     * @return object|null The parent node
     */
    function parent($parent=null)
    {
        // I am SURE that this doesn't work properly.
        // It fails to unset the current node from it's current parents nodes or children list first.
        if ($parent !== null) {
            $this->parent = $parent;
            $this->parent->nodes[] = $this;
            $this->parent->children[] = $this;
        }

        return $this->parent;
    }

    /**
     * @return bool True if the node has at least one child node
     */
    function has_child()
    {
        return !empty($this->children);
    }

    /**
     * Get child node at specified index
     *
     * @param int $idx The index of the child node to return, `-1` to return all
     * child nodes.
     * @return object|array|null The child node at the specified index, all child
     * nodes or null if the index is invalid.
     */
    function children($idx=-1)
    {
        if ($idx===-1) {
            return $this->children;
        }

        if (isset($this->children[$idx])) {
            return $this->children[$idx];
        }

        return null;
    }

    /**
     * Get first child node
     *
     * @return object|null The first child node or null if the current node has
     * no child nodes.
     *
     * @todo Use `empty()` instead of `count()` to improve performance on large
     * arrays.
     */
    function first_child()
    {
        if (count($this->children)>0) {
            return $this->children[0];
        }
        return null;
    }

    /**
     * Get last child node
     *
     * @return object|null The last child node or null if the current node has
     * no child nodes.
     *
     * @todo Use `end()` to slightly improve performance on large arrays.
     */
    function last_child()
    {
        if (($count=count($this->children))>0) {
            return $this->children[$count-1];
        }
        return null;
    }

    /**
     * Get next sibling node
     *
     * @return object|null The sibling node or null if the current node has no
     * sibling nodes.
*/
    function next_sibling()
    {
        if ($this->parent===null) {
            return null;
        }

        // Locate this node among the parent's children.
        $idx = 0;
        $count = count($this->parent->children);

        while ($idx<$count && $this!==$this->parent->children[$idx]) {
            ++$idx;
        }

        if (++$idx>=$count) {
            return null;
        }

        return $this->parent->children[$idx];
    }

    /**
     * Get previous sibling node
     *
     * @return object|null The sibling node or null if the current node has no
     * sibling nodes.
     */
    function prev_sibling()
    {
        if ($this->parent===null) {
            return null;
        }

        $idx = 0;
        $count = count($this->parent->children);

        while ($idx<$count && $this!==$this->parent->children[$idx]) {
            ++$idx;
        }

        // Not found among the parent's children: there is no previous sibling.
        // Without this guard the parent's last child would be returned.
        if ($idx>=$count) {
            return null;
        }

        if (--$idx<0) {
            return null;
        }

        return $this->parent->children[$idx];
    }

    /**
     * Traverse ancestors to the first matching tag.
     *
     * @param string $tag Tag to find
     * @return object|null First matching node in the DOM tree or null if no
     * match was found.
     *
     * @todo Null is returned implicitly by calling ->parent on the root node.
     * This behaviour could change at any time, rendering this function invalid.
     */
    function find_ancestor_tag($tag)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        // Start by including ourselves in the comparison.
        $returnDom = $this;

        while (!is_null($returnDom)) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag);
            }

            if ($returnDom->tag == $tag) {
                break;
            }

            $returnDom = $returnDom->parent;
        }

        return $returnDom;
    }

    /**
     * Get node's inner text (everything inside the opening and closing tags)
     *
     * @return string
     */
    function innertext()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';
        foreach ($this->nodes as $n) {
            $ret .= $n->outertext();
        }
        return $ret;
    }

    /**
     * Get node's outer text (everything including the opening and closing tags)
     *
     * @return string
     */
    function outertext()
    {
        global $debug_object;
        if (is_object($debug_object)) {
            $text = '';
            if ($this->tag == 'text') {
                if (!empty($this->text)) {
                    $text = " with text: " . $this->text;
                }
            }
            $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
        }

        if ($this->tag==='root') {
            return $this->innertext();
        }

        // trigger callback
        if ($this->dom && $this->dom->callback!==null) {
            call_user_func_array($this->dom->callback, array($this));
        }

        if (isset($this->_[HDOM_INFO_OUTER])) {
            return $this->_[HDOM_INFO_OUTER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        // render begin tag
        if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
            $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
        } else {
            $ret = "";
        }

        // render inner text
        if (isset($this->_[HDOM_INFO_INNER])) {
            // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
            if ($this->tag != "br") {
                $ret .= $this->_[HDOM_INFO_INNER];
            }
        } else {
            if ($this->nodes) {
                foreach ($this->nodes as $n) {
                    $ret .= $this->convert_text($n->outertext());
                }
            }
        }

        // render end tag
        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) {
            $ret .= '</'.$this->tag.'>';
        }

        return $ret;
    }

    /**
     * Get node's plain text (everything excluding all tags)
     *
     * @return string
     */
    function text()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }

        if (strcasecmp($this->tag, 'script')===0) { return ''; }
        if (strcasecmp($this->tag, 'style')===0) { return ''; }

        $ret = '';

        // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
        // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
        // WHY is this happening?
        if (!is_null($this->nodes)) {
            foreach ($this->nodes as $n) {
                // Start paragraph after a blank line
                if ($n->tag == 'p') {
                    $ret .= "\n\n";
                }

                $ret .= $this->convert_text($n->text());

                // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
                if ($n->tag == "span") {
                    $ret .= $this->dom->default_span_text;
                }
            }
        }

        return trim($ret);
    }

    /**
     * Get node's xml text (inner text as a CDATA section)
     *
     * The CDATA delimiters are removed from the inner text; the opening
     * delimiter is matched case-insensitively.
     *
     * @return string
     */
    function xmltext()
    {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    /**
     * Build node's text with tag: the opening tag with all attributes that
     * have not been removed, using the spacing and quoting recorded in the
     * info array.
     *
     * @return string
     */
    function makeup()
    {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '<'.$this->tag;
        $i = -1;

        foreach ($this->attr as $key=>$val) {
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false) {
                continue;
            }

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];

            //no value attr: nowrap, checked selected...
            if ($val===true) {
                $ret .= $key;
            } else {
                switch ($this->_[HDOM_INFO_QUOTE][$i]) {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }
                $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
            }
        }

        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find elements by css selector.
     * PaperG - added ability for find to lowercase the value of the selector.
     *
     * @param string $selector CSS selector(s); several may be separated by commas.
     * @param int|null $idx null to return every match, otherwise the index of
     * the match to return (negative values count from the end).
     * @param bool $lowercase Passed on to seek() for case insensitive value tests.
     * @return array|object|null Array of nodes when $idx is null, otherwise the
     * node at $idx or null if there is none.
     */
    function find($selector, $idx=null, $lowercase=false)
    {
        $selectors = $this->parse_selector($selector);
        if (($count=count($selectors))===0) { return array(); }
        $found_keys = array();

        // find each selector
        for ($c=0; $c<$count; ++$c) {
            // The change on the below line was documented on the sourceforge code tracker id 2788009
            // used to be: if (($levle=count($selectors[0]))===0) return array();
            if (($level=count($selectors[$c]))===0) { return array(); }
            if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

            $head = array($this->_[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$level; ++$l) {
                $ret = array();
                foreach ($head as $k=>$v) {
                    $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                    //PaperG - Pass this optional parameter on to the seek function.
                    $n->seek($selectors[$c][$l], $ret, $lowercase);
                }
                $head = $ret;
            }

            foreach ($head as $k=>$v) {
                if (!isset($found_keys[$k])) {
                    $found_keys[$k] = 1;
                }
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach ($found_keys as $k=>$v) {
            $found[] = $this->dom->nodes[$k];
        }

        // return nth-element or array
        if (is_null($idx)) {
            return $found;
        } else if ($idx<0) {
            $idx = count($found) + $idx;
        }
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Seek for given conditions.
     * PaperG - added parameter to allow for case insensitive testing of the value of a selector.
     *
     * @param array $selector One parsed selector: (tag, key, val, exp, no_key).
     * @param array $ret Passed by reference; keys of matching nodes are set to 1.
     * @param bool $lowercase Compare selector values case insensitively.
     */
    protected function seek($selector, &$ret, $lowercase=false)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        list($tag, $key, $val, $exp, $no_key) = $selector;

        // xpath index
        if ($tag && $key && is_numeric($key)) {
            $count = 0;
            foreach ($this->children as $c) {
                if ($tag==='*' || $tag===$c->tag) {
                    if (++$count==$key) {
                        $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                        return;
                    }
                }
            }
            return;
        }

        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end==0) {
            // Walk up to the nearest ancestor that has an end position.
            // Test for null first so a missing ancestor is never dereferenced.
            $parent = $this->parent;
            while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent!==null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];

            $pass = true;

            if ($tag==='*' && !$key) {
                if (in_array($node, $this->children, true)) {
                    $ret[$i] = 1;
                }
                continue;
            }

            // compare tag
            if ($tag && $tag!=$node->tag && $tag!=='*') { $pass=false; }

            // compare key
            if ($pass && $key) {
                if ($no_key) {
                    if (isset($node->attr[$key])) { $pass=false; }
                } else {
                    if (($key != "plaintext") && !isset($node->attr[$key])) { $pass=false; }
                }
            }

            // compare value
            if ($pass && $key && $val && $val!=='*') {
                // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
                if ($key == "plaintext") {
                    // $node->plaintext actually returns $node->text();
                    $nodeKeyValue = $node->text();
                } else {
                    // this is a normal search, we want the value of that attribute of the tag.
                    $nodeKeyValue = $node->attr[$key];
                }
                if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

                //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
                if ($lowercase) {
                    $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
                } else {
                    $check = $this->match($exp, $val, $nodeKeyValue);
                }
                if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

                // handle multiple class
                if (!$check && strcasecmp($key, 'class')===0) {
                    foreach (explode(' ',$node->attr[$key]) as $k) {
                        // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                        if (!empty($k)) {
                            if ($lowercase) {
                                $check = $this->match($exp, strtolower($val), strtolower($k));
                            } else {
                                $check = $this->match($exp, $val, $k);
                            }
                            if ($check) { break; }
                        }
                    }
                }

                if (!$check) { $pass = false; }
            }

            if ($pass) { $ret[$i] = 1; }
            unset($node);
        }

        // It's passed by reference so this is actually what this function returns.
        if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
    }

    /**
     * Test a value against a selector pattern.
     *
     * @param string $exp Operator: '=', '!=', '^=', '$=' or '*='.
     * @param string $pattern Value from the selector. For '*=' a pattern that
     * starts with '/' is used as a complete regular expression.
     * @param string $value Value taken from the node.
     * @return bool|int Result of the comparison / preg_match(); false for an
     * unknown operator.
     */
    protected function match($exp, $pattern, $value)
    {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

        switch ($exp) {
            case '=':
                return ($value===$pattern);
            case '!=':
                return ($value!==$pattern);
            case '^=':
                return preg_match("/^".preg_quote($pattern,'/')."/", $value);
            case '$=':
                return preg_match("/".preg_quote($pattern,'/')."$/", $value);
            case '*=':
                if ($pattern[0]=='/') {
                    return preg_match($pattern, $value);
                }
                return preg_match("/".$pattern."/i", $value);
        }
        return false;
    }

    /**
     * Parse a selector string into a list of selector groups (one group per
     * comma separated selector); every group is a list of
     * (tag, key, val, exp, no_key) arrays.
     *
     * @param string $selector_string
     * @return array
     */
    protected function parse_selector($selector_string)
    {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

        // pattern of CSS selectors, modified from mootools
        // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
        // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.
        // Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured.
        // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
        // farther study is required to determine of this should be documented or removed.
        // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
        $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
        preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
        if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

        $selectors = array();
        $result = array();
        //print_r($matches);

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);
            if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') { continue; }
            // for browser generated xpath
            if ($m[1]==='tbody') { continue; }

            list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
            if (!empty($m[2])) {$key='id'; $val=$m[2];}
            if (!empty($m[3])) {$key='class'; $val=$m[3];}
            if (!empty($m[4])) {$key=$m[4];}
            if (!empty($m[5])) {$exp=$m[5];}
            if (!empty($m[6])) {$val=$m[6];}

            // convert to lowercase
            if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
            //elements that do NOT have the specified attribute
            if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

            $result[] = array($tag, $key, $val, $exp, $no_key);
            if (trim($m[7])===',') {
                $selectors[] = $result;
                $result = array();
            }
        }
        if (count($result)>0) {
            $selectors[] = $result;
        }
        return $selectors;
    }

    /**
     * Magic getter: returns the (charset converted) attribute of that name if
     * it is set, otherwise one of the virtual properties outertext, innertext,
     * plaintext or xmltext. For any other name the result of
     * array_key_exists() on the attribute list is returned.
     */
    function __get($name)
    {
        if (isset($this->attr[$name])) {
            return $this->convert_text($this->attr[$name]);
        }
        switch ($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->text();
            case 'xmltext': return $this->xmltext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Magic setter: overrides outertext/innertext, or sets an attribute. A new
     * attribute gets a single leading space and double quotes recorded in the
     * info array.
     */
    function __set($name, $value)
    {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

        switch ($name) {
            case 'outertext':
                return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) {
                    return $this->_[HDOM_INFO_TEXT] = $value;
                }
                return $this->_[HDOM_INFO_INNER] = $value;
        }
        if (!isset($this->attr[$name])) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    function __isset($name)
    {
        switch ($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
    }

    function __unset($name)
    {
        if (isset($this->attr[$name])) {
            unset($this->attr[$name]);
        }
    }

    /**
     * PaperG - Function to convert the text from one character set to another if the two sets are not the same.
     *
     * @param string $text
     * @return string|false The converted text (whatever iconv() returns when a
     * conversion takes place), with a leading/trailing UTF-8 BOM removed when
     * the target charset is UTF-8.
     */
    function convert_text($text)
    {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

        $converted_text = $text;

        $sourceCharset = "";
        $targetCharset = "";

        if ($this->dom) {
            $sourceCharset = strtoupper($this->dom->_charset);
            $targetCharset = strtoupper($this->dom->_target_charset);
        }
        if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

        if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
            // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
            if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) {
                $converted_text = $text;
            } else {
                $converted_text = iconv($sourceCharset, $targetCharset, $text);
            }
        }

        // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
        if ($targetCharset == 'UTF-8') {
            if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 3);
            }
            if (substr($converted_text, -3) == "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 0, -3);
            }
        }

        return $converted_text;
    }

    /**
     * Returns true if $str is valid UTF-8 and false otherwise.
*
     * Only the lead-byte / continuation-byte structure is checked.
     *
     * @param mixed $str String to be tested
     * @return boolean
     */
    static function is_utf8($str)
    {
        $c=0; $b=0;
        $bits=0;
        $len=strlen($str);
        for ($i=0; $i<$len; $i++) {
            $c=ord($str[$i]);
            // Every byte with the high bit set (0x80 and up) starts a
            // multi-byte check; 0x80 itself is a stray continuation byte.
            if ($c >= 128) {
                if (($c >= 254)) return false;
                elseif ($c >= 252) $bits=6;
                elseif ($c >= 248) $bits=5;
                elseif ($c >= 240) $bits=4;
                elseif ($c >= 224) $bits=3;
                elseif ($c >= 192) $bits=2;
                else return false;
                // The whole sequence must fit into the string.
                if (($i+$bits) > $len) return false;
                while ($bits > 1) {
                    $i++;
                    $b=ord($str[$i]);
                    // Continuation bytes are 0x80..0xBF.
                    if ($b < 128 || $b > 191) return false;
                    $bits--;
                }
            }
        }
        return true;
    }
    /*
    function is_utf8($string)
    {
        //this is buggy
        return (utf8_encode(utf8_decode($string)) == $string);
    }
    */

    /**
     * Function to try a few tricks to determine the displayed size of an img on the page.
     * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
     *
     * The width/height attributes win; otherwise a `px` value from the inline
     * style is used.
     *
     * @author John Schlick
     * @version April 19 2012
     * @return array|false an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
     */
    function get_display_size()
    {
        $width = -1;
        $height = -1;

        if ($this->tag !== 'img') {
            return false;
        }

        // See if there is a height or width attribute in the tag itself.
        if (isset($this->attr['width'])) {
            $width = $this->attr['width'];
        }

        if (isset($this->attr['height'])) {
            $height = $this->attr['height'];
        }

        // Now look for an inline style.
        if (isset($this->attr['style'])) {
            // Thanks to user gnarf from stackoverflow for this regular expression.
            $attributes = array();
            preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
            foreach ($matches as $match) {
                // The value group is greedy and can carry trailing blanks
                // ("100px ;"), which would defeat the "px" suffix test below.
                $attributes[$match[1]] = trim($match[2]);
            }

            // If there is a width in the style attributes:
            if (isset($attributes['width']) && $width == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['width'], -2)) == 'px') {
                    $proposed_width = substr($attributes['width'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    // filter_var() returns false on failure; 0 is a valid integer.
                    if (filter_var($proposed_width, FILTER_VALIDATE_INT) !== false) {
                        $width = $proposed_width;
                    }
                }
            }

            // If there is a height in the style attributes:
            if (isset($attributes['height']) && $height == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['height'], -2)) == 'px') {
                    $proposed_height = substr($attributes['height'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    if (filter_var($proposed_height, FILTER_VALIDATE_INT) !== false) {
                        $height = $proposed_height;
                    }
                }
            }
        }

        // Future enhancement:
        // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

        // Far future enhancement
        // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
        // Note that in this case, the class or id will have the img subselector for it to apply to the image.

        // ridiculously far future development
        // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

        $result = array('height' => $height, 'width' => $width);
        return $result;
    }

    // camel naming conventions
    function getAllAttributes() {return $this->attr;}
    function getAttribute($name) {return $this->__get($name);}
    function setAttribute($name, $value) {$this->__set($name, $value);}
    function hasAttribute($name) {return $this->__isset($name);}
    function removeAttribute($name) {$this->__set($name, null);}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
    function parentNode() {return $this->parent();}
    function childNodes($idx=-1) {return $this->children($idx);}
    function firstChild() {return $this->first_child();}
    function lastChild() {return $this->last_child();}
    function nextSibling() {return $this->next_sibling();}
    function previousSibling() {return $this->prev_sibling();}
    function hasChildNodes() {return $this->has_child();}
    function nodeName() {return $this->tag;}
    function appendChild($node) {$node->parent($this); return $node;}
}

/**
 * simple html dom parser
 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
 * Paperg - change $size from protected to public so we can easily access it
 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
 *
 * @package PlaceLocalInclude
 */
class simple_html_dom
{
    /**
     * The root node of the document
     *
     * @var object
     */
    public $root = null;

    /**
     * List of nodes in the current DOM
     *
     * @var array
     */
    public $nodes = array();

    /**
     * Callback function to run for each element in the DOM.
* * @var callable|null */ public $callback = null; /** * Indicates how tags and attributes are matched * * @var bool When set to **true** tags and attributes will be converted to * lowercase before matching. */ public $lowercase = false; /** * Original document size * * Holds the original document size. * * @var int */ public $original_size; /** * Current document size * * Holds the current document size. The document size is determined by the * string length of ({@see simple_html_dom::$doc}). * * _Note_: Using this variable is more efficient than calling `strlen($doc)` * * @var int * */ public $size; /** * Current position in the document * * @var int */ protected $pos; /** * The document * * @var string */ protected $doc; /** * Current character * * Holds the current character at position {@see simple_html_dom::$pos} in * the document {@see simple_html_dom::$doc} * * _Note_: Using this variable is more efficient than calling `substr($doc, $pos, 1)` * * @var string */ protected $char; protected $cursor; /** * Parent node of the next node detected by the parser * * @var object */ protected $parent; protected $noise = array(); /** * Tokens considered blank in HTML * * @var string */ protected $token_blank = " \t\r\n"; /** * Tokens to identify the equal sign for attributes, stopping either at the * closing tag ("/" i.e. "") or the end of an opening tag (">" i.e. * "") * * @var string */ protected $token_equal = ' =/>'; /** * Tokens to identify the end of a tag name. A tag name either ends on the * ending slash ("/" i.e. "") or whitespace ("\s\r\n\t") * * @var string */ protected $token_slash = " />\r\n\t"; /** * Tokens to identify the end of an attribute * * @var string */ protected $token_attr = ' >'; // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. public $_charset = ''; public $_target_charset = ''; /** * Innertext for
elements * * @var string */ protected $default_br_text = ""; /** * Suffix for elements * * @var string */ public $default_span_text = ""; /** * Defines a list of self-closing tags (Void elements) according to the HTML * Specification * * _Remarks_: * - Use `isset()` instead of `in_array()` on array elements to boost * performance about 30% * - Sort elements by name for better readability! * * @link https://www.w3.org/TR/html HTML Specification * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements */ protected $self_closing_tags = array( 'area'=>1, 'base'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1 ); /** * Defines a list of tags which - if closed - close all optional closing * elements within if they haven't been closed yet. (So, an element where * neither opening nor closing tag is omissible consistently closes every * optional closing element within) * * _Remarks_: * - Use `isset()` instead of `in_array()` on array elements to boost * performance about 30% * - Sort elements by name for better readability! */ protected $block_tags = array( 'body'=>1, 'div'=>1, 'form'=>1, 'root'=>1, 'span'=>1, 'table'=>1 ); /** * Defines elements whose end tag is omissible. * * * key = Name of an element whose end tag is omissible. * * value = Names of elements whose end tag is omissible, that are closed * by the current element. * * _Remarks_: * - Use `isset()` instead of `in_array()` on array elements to boost * performance about 30% * - Sort elements by name for better readability! * * **Example** * * An `li` element’s end tag may be omitted if the `li` element is immediately * followed by another `li` element. To do that, add following element to the * array: * * ```php * 'li' => array('li'), * ``` * * With this, the following two examples are considered equal. Note that the * second example is missing the closing tags on `li` elements. * * ```html *
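 * <ul>
 * <li>First Item</li>
 * <li>Second Item</li>
 * </ul>
 * ```
 *
 * <ul>
 * <li>First Item</li>
 * <li>Second Item</li>
 * </ul>
 *
 * ```html
 * <ul>
 * <li>First Item
 * <li>Second Item
 * </ul>
 * ```
 *
 * <ul>
 * <li>First Item
 * <li>Second Item
 * </ul>
 *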
* @var array A two-dimensional array where the key is the name of an
     * element whose end tag is omissible and the value is an array of elements
     * whose end tag is omissible, that are closed by the current element.
     *
     * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
     *
     * @todo The implementation of optional closing tags doesn't work in all cases
     * because it only consideres elements who close other optional closing
     * tags, not taking into account that some (non-blocking) tags should close
     * these optional closing tags. For example, the end tag for "p" is omissible
     * and can be closed by an "address" element, whose end tag is NOT omissible.
     * Currently a "p" element without closing tag stops at the next "p" element
     * or blocking tag, even if it contains other elements.
     *
     * @todo Known sourceforge issue #2977341
     * B tags that are not closed cause us to return everything to the end of
     * the document.
     */
    protected $optional_closing_tags = array(
        'b'=>array('b'=>1), // Not optional, see https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'dd'=>array('dd'=>1, 'dt'=>1),
        'dl'=>array('dd'=>1, 'dt'=>1), // Not optional, see https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dt'=>array('dd'=>1, 'dt'=>1),
        'li'=>array('li'=>1),
        'optgroup'=>array('optgroup'=>1, 'option'=>1),
        'option'=>array('optgroup'=>1, 'option'=>1),
        'p'=>array('p'=>1),
        'rp'=>array('rp'=>1, 'rt'=>1),
        'rt'=>array('rp'=>1, 'rt'=>1),
        'td'=>array('td'=>1, 'th'=>1),
        'th'=>array('td'=>1, 'th'=>1),
        'tr'=>array('td'=>1, 'th'=>1, 'tr'=>1),
    );

    /**
     * Create the DOM and optionally load a document.
     *
     * @param string|null $str A document to load right away: either a string
     * starting with "http://" or the path of an existing file (handed to
     * load_file()), or the HTML itself (handed to load()).
     * @param bool $lowercase Passed to load().
     * @param bool $forceTagsClosed false empties the list of optional closing
     * tags.
     * @param string $target_charset Stored in $_target_charset.
     * @param bool $stripRN Passed to load().
     * @param string $defaultBRText Passed to load().
     * @param string $defaultSpanText Passed to load().
     * @param int $options Passed to load().
     */
    function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
    {
        // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
        // This is done before anything is loaded so that the setting is already
        // in place for a document handed to the constructor.
        // NOTE(review): presumably the parser reads $optional_closing_tags while loading - confirm.
        if (!$forceTagsClosed) {
            $this->optional_closing_tags = array();
        }

        if ($str) {
            if (preg_match("/^http:\/\//i",$str) || is_file($str)) {
                $this->load_file($str);
            } else {
                $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options);
            }
        }

        $this->_target_charset = $target_charset;
    }

    function __destruct()
    {
        $this->clear();
    }

    // load html from string
    function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
    {
        global $debug_object;

        // prepare
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

        // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
        // Script tags removal now preceeds style tag removal.
        // strip out