<?php

/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{

    /**
     * Whitespace characters for str(c)spn: space, tab, CR and LF.
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge.
     *
     * Escapes the contents of a matched script element while leaving the
     * opening and closing tags untouched.
     *
     * @param $matches preg match array: [1] opening tag, [2] contents,
     *                 [3] closing tag ([0] is the full match and is unused)
     * @return String opening tag, HTML-escaped contents, closing tag
     */
    protected function scriptCallback($matches) {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of token objects.
     *
     * The string is scanned left to right, alternating between "outside a
     * tag" (everything up to the next '<' becomes a Text token) and "inside
     * a tag" (everything up to the next '>' is classified as a comment,
     * end tag, start tag, empty tag, or - when it does not start with a
     * letter - plain text).
     *
     * @param $html    String of HTML to tokenize.
     * @param $config  Configuration object; read through get(namespace, key).
     * @param $context Context object; 'CurrentLine' is registered for the
     *                 duration of the call and 'ErrorCollector' is fetched
     *                 when Core.CollectErrors is on.
     * @return Array of HTMLPurifier_Token_* objects in document order.
     */
    public function tokenizeHTML($html, $config, $context) {

        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML', 'Trusted')) {
            $html = preg_replace_callback(
                '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'),
                $html
            );
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core', 'CollectErrors');
        }

        if ($maintain_line_numbers) $current_line = 1;
        else $current_line = false;
        $context->register('CurrentLine', $current_line);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core', 'CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        while (++$loops) {

            // recalculate lines
            if (
                $maintain_line_numbers && // line number tracking is on
                $synchronize_interval &&  // synchronization is on
                $cursor > 0 &&            // cursor is further than zero
                $loops % $synchronize_interval === 0 // time to synchronize!
            ) {
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor, $position_next_lt - $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) $token->line = $current_line;
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // "<>": there is no tag name to process.  Emit the two
                    // characters as literal text and leave tag mode; without
                    // this the token would be lost and the text that follows
                    // would be misread as the inside of a tag.
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new HTMLPurifier_Token_Text('<>');
                    if ($maintain_line_numbers) $token->line = $current_line;
                    $array[] = $token;
                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (substr($segment, 0, 3) === '!--') {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    // drop the leading "!--" from the comment body
                    $token = new HTMLPurifier_Token_Comment(
                        substr($segment, 3, $strlen_segment - 3)
                    );
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing "-->" only when there actually was one
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alphabetic, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML: $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new HTMLPurifier_Token_Text(
                        '<' . $this->parseData($segment) . '>'
                    );
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    // no whitespace in the segment: it is just a tag name
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string = trim(substr($segment, $position_first_space));
                // strict comparison: an attribute string of "0" is not empty
                if ($attribute_string !== '') {
                    $attr = $this->parseAttributeString(
                        $attribute_string, $config, $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
                $token = new HTMLPurifier_Token_Text(
                    '<' . $this->parseData(substr($html, $cursor))
                );
                if ($maintain_line_numbers) $token->line = $current_line;
                // no cursor scroll needed: the rest of the input is consumed
                $array[] = $token;
                break;
            }
        }

        $context->destroy('CurrentLine');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length.
     *
     * @param $haystack String to search in.
     * @param $needle   Substring to count.
     * @param $offset   Byte offset at which counting starts.
     * @param $length   Number of bytes, from $offset, to search.
     * @return Integer number of occurrences of $needle in the given window.
     */
    protected function substrCount($haystack, $needle, $offset, $length) {
        // the version check is performed once and cached across calls
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            // before PHP 5.1 substr_count() only took two arguments,
            // so cut the window out by hand first
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * Attribute values (quoted or unquoted) are passed through parseData().
     * An attribute without a value maps to its own name.
     *
     * @param $string  Inside of tag excluding name.
     * @param $config  Configuration object; Core.CollectErrors is read.
     * @param $context Context object; 'ErrorCollector' is fetched from it
     *                 when error collection is on.
     * @returns Assoc array of attributes.
     */
    public function parseAttributeString($string, $config, $context) {
        $string = (string) $string; // quick typecast

        if ($string === '') return array(); // no attributes

        $e = false;
        if ($config->get('Core', 'CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no whitespace => one attribute
        $num_equal = substr_count($string, '=');
        // check against every whitespace character the main loop honours,
        // and regardless of where in the string it occurs (including offset 0)
        $has_space = strcspn($string, $this->_whitespace) < strlen($string);
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            // strict comparisons: "0" is a legitimate key and value
            if ($key === '') {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                return array();
            }
            if ($quoted_value === '') return array($key => '');
            $first_char = $quoted_value[0];
            $last_char  = $quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) $value = '';
            // same post-processing as the general loop below
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array  = array();         // return assoc array of attributes
        $cursor = 0;               // current position in string (moves forward)
        $size   = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        while ($cursor < $size) {

            // scroll past whitespace in front of the key
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // nothing but trailing whitespace was left
                break;
            }

            // grab the key
            $key_begin = $cursor; // we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if ((string) $key === '') {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                // The cursor sits on a stray '='.  Step over it (the "1 +")
                // and over whatever is glued to it; without the extra step
                // a '=' followed by whitespace would never be consumed and
                // the loop would spin forever.
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // key was the last thing in the string: bool attribute
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = $string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                // we might be in front of a quote right now; the cursor may
                // also have run off the end when nothing follows the '='
                $char = isset($string[$cursor]) ? $string[$cursor] : '';

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end (closing quote not found)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) $value = '';
                $array[$key] = $this->parseData($value);
                $cursor++; // step over the closing quote / separating space

            } else {
                // boolattr; the cursor already points at the next key
                $array[$key] = $key;
            }
        }
        return $array;
    }

}
|---|