<?php

/**
 * Parser that uses PHP 5's DOM extension (part of the core).
 *
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
 * It gives us a forgiving HTML parser, which we use to transform the HTML
 * into a DOM, and then into the tokens. It is blazingly fast (for large
 * documents, it performs twenty times faster than
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
 *
 * @note Any empty elements will have empty tokens associated with them, even if
 *       this is prohibited by the spec. This cannot be fixed until the spec
 *       comes into play.
 *
 * @note PHP's DOM extension does not actually parse any entities; we use
 *       our own function to do that.
 *
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
 *          If this is a huge problem, due to the fact that HTML is hand
 *          edited and you are unable to get a parser cache that caches the
 *          output of HTML Purifier while keeping the original HTML lying
 *          around, you may want to run Tidy on the resulting output or use
 *          HTMLPurifier_Lexer_DirectLex.
 */

class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Token factory used to build every token this lexer emits.
     */
    private $factory;

    /**
     * Runs the parent constructor and sets up the token factory.
     */
    public function __construct() {
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by loading it into a DOMDocument and
     * walking the resulting tree.
     *
     * @param $html    HTML fragment to tokenize.
     * @param $config  Configuration object; Core.AggressivelyFixLt is read
     *                 from it, and it is handed to normalize() and wrapHTML().
     * @param $context Context object, handed to normalize() and wrapHTML().
     * @return Array-list of tokens for the fragment. The list is empty when
     *         the wrapper elements cannot be found in the parsed document.
     */
    public function tokenizeHTML($html, $config, $context) {

        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core', 'AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // 1. double-escape ampersands inside comments, so that step 3
            //    can tell our own substitutions from pre-existing entities
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // 2. escape every '<' that is not followed by a letter, '!' or '/'
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            // 3. revert steps 1 and 2 inside comments, whose contents must
            //    come through untouched
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // silence the warnings libxml raises on malformed markup
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();

        // descend <html> -> <body> -> <div> to reach the wrapper added by
        // wrapHTML(); bail out with no tokens instead of calling a method
        // on null if any level is missing
        $container = $doc;
        foreach (array('html', 'body', 'div') as $wrapper_tag) {
            $container = $container->getElementsByTagName($wrapper_tag)->item(0);
            if ($container === null) {
                return $tokens;
            }
        }

        $this->tokenizeDOM($container, $tokens);
        return $tokens;
    }

    /**
     * Recursive function that tokenizes a node, putting it into an accumulator.
     *
     * Text nodes become text tokens, CDATA sections become text tokens after
     * parseData(), comment nodes become comment tokens, and elements become
     * either an empty token or a start/end pair around their children. Any
     * other node type is ignored. Nothing is returned; tokens are appended
     * to $tokens.
     *
     * @param $node    DOMNode to be tokenized.
     * @param $tokens  Array-list of already tokenized tokens, passed by
     *                 reference and appended to.
     * @param $collect Says whether the start and close tags are collected;
     *                 set to false at first recursion because it's the
     *                 implicit DIV tag you're dealing with.
     */
    protected function tokenizeDOM($node, &$tokens, $collect = false) {

        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // strip a surrounding <!-- ... --> wrapper, if present
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return;
        } elseif (
            // not-well tested: there may be other nodes we have to grab
            $node->nodeType !== XML_ELEMENT_NODE
        ) {
            return;
        }

        $attr = $node->hasAttributes() ?
            $this->transformAttrToAssoc($node->attributes) :
            array();

        $tag_name = $node->tagName;

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($tag_name, $attr);
            }
            return;
        }

        if ($collect) { // don't wrap on first iteration
            $tokens[] = $this->factory->createStart($tag_name, $attr);
        }
        foreach ($node->childNodes as $child) {
            // remember, it's an accumulator. Otherwise, we'd have
            // to use array_merge
            $this->tokenizeDOM($child, $tokens, true);
        }
        if ($collect) {
            $tokens[] = $this->factory->createEnd($tag_name);
        }

    }

    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return Associative array mapping attribute name to attribute value.
     */
    protected function transformAttrToAssoc($node_map) {
        // relies on the node map being iterable with foreach and
        // exposing a ->length attribute
        if ($node_map->length === 0) return array();
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     *
     * @param $errno  Error level, ignored.
     * @param $errstr Error message, ignored.
     */
    public function muteErrorHandler($errno, $errstr) {}

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments
     *
     * Turns '&amp;' back into '&' and '&lt;' back into '<' in the comment
     * body. Entities that were armored by callbackArmorCommentEntities
     * (e.g. '&amp;lt;') come back out as originally written ('&lt;').
     *
     * @param $matches Match array for the comment pattern: [1] is the comment
     *                 body, [2] the terminator ('-->' or empty at end of input).
     * @return The comment with the substitutions undone.
     */
    public function callbackUndoCommentSubst($matches) {
        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     *
     * @param $matches Match array for the comment pattern: [1] is the comment
     *                 body, [2] the terminator ('-->' or empty at end of input).
     * @return The comment with every '&' in its body replaced by '&amp;'.
     */
    public function callbackArmorCommentEntities($matches) {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     *
     * Emits a DOCTYPE when the HTML definition's doctype has a public or
     * system identifier, then a head with a UTF-8 Content-Type meta tag,
     * and finally the fragment inside <body><div>...</div></body>.
     *
     * @param $html    HTML fragment to wrap.
     * @param $config  Configuration object; supplies the HTML definition.
     * @param $context Context object, not used here.
     * @return Complete HTML document as a string.
     */
    protected function wrapHTML($html, $config, $context) {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body><div>'.$html.'</div></body></html>';
        return $ret;
    }

}