Context Navigation

DirectLex.php

Revision 21, 18.9 kB (checked in by admin, 18 years ago)

Line
1	<?php
2
3	/**
4	* Our in-house implementation of a parser.
5	*
6	* A pure PHP parser, DirectLex has absolutely no dependencies, making
7	* it a reasonably good default for PHP4. Written with efficiency in mind,
8	* it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
9	* pales in comparison to HTMLPurifier_Lexer_DOMLex.
10	*
11	* @todo Reread XML spec and document differences.
12	*/
13	class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
14	{
15
16	/**
17	* Whitespace characters for str(c)spn.
18	*/
19	protected $_whitespace = "\x20\x09\x0D\x0A";
20
/**
 * Escapes the body of a matched script element.
 *
 * Serves as the preg_replace_callback() handler for the script
 * "CDATA fudge" performed in tokenizeHTML().
 *
 * @param $matches Regex captures, in form of
 *        array(whole match, opening tag, contents, closing tag)
 * @returns String made of the untouched opening tag, the contents run
 *          through htmlspecialchars(), and the untouched closing tag.
 */
protected function scriptCallback($matches) {
    $opening_tag  = $matches[1];
    $escaped_body = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');
    $closing_tag  = $matches[3];
    return $opening_tag . $escaped_body . $closing_tag;
}
28
/**
 * Converts an HTML string into a flat list of token objects.
 *
 * The string is scanned left to right with a cursor. Runs of character
 * data become Text tokens; everything between a "<" and the next ">" is
 * classified as a comment, end tag, start tag, self-closing (empty) tag,
 * or - when it does not begin with a letter - literal text.
 *
 * While running, the current line number (or false when line tracking is
 * off) is registered in the context under 'CurrentLine'; the entry is
 * destroyed again before returning.
 *
 * @param $html    String of HTML to tokenize.
 * @param $config  Configuration object, queried through get(namespace, key).
 * @param $context Context object; supplies 'ErrorCollector' when
 *                 Core.CollectErrors is enabled.
 * @returns Array of HTMLPurifier_Token_* objects in document order.
 */
public function tokenizeHTML($html, $config, $context) {

    // special normalization for script tags without any armor
    // our "armor" heuristic is a < sign any number of whitespaces after
    // the first script tag
    if ($config->get('HTML', 'Trusted')) {
        // [^>]* lets the opening tag carry attributes (or none at all),
        // \s* allows any amount of whitespace before the script body
        $html = preg_replace_callback(
            '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'),
            $html
        );
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // automatically determine line numbering by checking
        // if error collection is on
        $maintain_line_numbers = $config->get('Core', 'CollectErrors');
    }

    if ($maintain_line_numbers) $current_line = 1;
    else $current_line = false;
    // NOTE(review): presumably registered by reference so that later
    // updates to $current_line are visible through the context - confirm
    // against the context class
    $context->register('CurrentLine', $current_line);
    $nl = "\n";
    // how often to manually recalculate. This will ALWAYS be right,
    // but it's pretty wasteful. Set to 0 to turn off
    $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core', 'CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // for testing synchronization
    $loops = 0;

    // every branch below ends in either continue or break
    while (++$loops) {

        // recalculate lines
        if (
            $maintain_line_numbers && // line number tracking is on
            $synchronize_interval &&  // synchronization is on
            $cursor > 0 &&            // cursor is further than zero
            $loops % $synchronize_interval === 0 // time to synchronize!
        ) {
            $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        // special case to set up context
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor, $position_next_lt - $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->line = $current_line;
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($html)) break;
            // Create Text of rest of string
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) $token->line = $current_line;
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // "<>": there is nothing between the brackets. Emit the
                // "<" as literal text and leave tag state; the cursor is
                // still on the ">", so the next iteration picks it up as
                // ordinary character data and no input is lost.
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) $token->line = $current_line;
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // somehow, we attempted to access beyond the end of
                // the string, defense-in-depth, reported by Nate Abele
                break;
            }

            // Check if it's a comment
            if (substr($segment, 0, 3) === '!--') {
                // re-determine segment length, looking for -->
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // uh oh, we have a comment that extends to
                    // infinity. Can't be helped: set comment
                    // end position to end of string
                    if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                // strip the leading "!--" from the comment body
                $token = new HTMLPurifier_Token_Comment(
                    substr($segment, 3, $strlen_segment - 3)
                );
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // skip past the "-->" terminator when there was one
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check leading character is alphabetic, if not, we may
            // have accidentally grabbed an emoticon. Translate into
            // text and go our merry way
            if (!ctype_alpha($segment[0])) {
                // XML: $segment[0] !== '_' && $segment[0] !== ':'
                if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                $token = new HTMLPurifier_Token_Text(
                    '<' . $this->parseData($segment) . '>'
                );
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Check if there are any attributes
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // no whitespace in the segment: it is just a tag name
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string = trim(
                substr($segment, $position_first_space)
            );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string, $config, $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->line = $current_line;
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside tag, but there's no ending > sign
            if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
            $token = new HTMLPurifier_Token_Text(
                '<' .
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) $token->line = $current_line;
            // no cursor scroll needed: the rest of the string has been
            // consumed and we leave the loop
            $array[] = $token;
            break;
        }
    }

    $context->destroy('CurrentLine');
    return $array;
}
298
/**
 * Counts occurrences of a needle inside a window of the haystack.
 *
 * PHP 5.0.x compatible stand-in for substr_count() with offset and
 * length: runtimes older than 5.1 get the window cut out with substr()
 * first, newer ones use the native four-argument form. The version
 * probe runs once and is cached in a static.
 *
 * @param $haystack String to search in.
 * @param $needle   Substring to count.
 * @param $offset   Start of the window within $haystack.
 * @param $length   Length of the window.
 * @returns Number of times $needle occurs inside the window.
 */
protected function substrCount($haystack, $needle, $offset, $length) {
    static $needs_emulation;
    if (null === $needs_emulation) {
        $needs_emulation = version_compare(PHP_VERSION, '5.1', '<');
    }
    if (!$needs_emulation) {
        return substr_count($haystack, $needle, $offset, $length);
    }
    $window = substr($haystack, $offset, $length);
    return substr_count($window, $needle);
}
314
/**
 * Takes the inside of an HTML tag and makes an assoc array of attributes.
 *
 * Boolean attributes (no "=") map to their own name. Values may be
 * double-quoted, single-quoted or unquoted; every value is passed
 * through parseData() before being stored. When Core.CollectErrors is
 * enabled, malformed input is reported to the 'ErrorCollector' taken
 * from the context.
 *
 * @param $string  Inside of tag excluding name.
 * @param $config  Configuration object, queried through get(namespace, key).
 * @param $context Context object supplying 'ErrorCollector'.
 * @returns Assoc array of attributes.
 */
public function parseAttributeString($string, $config, $context) {
    $string = (string) $string; // quick typecast

    if ($string == '') return array(); // no attributes

    $e = false;
    if ($config->get('Core', 'CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // let's see if we can abort as quickly as possible
    // one equal sign, no whitespace => one attribute
    $num_equal = substr_count($string, '=');
    // check the complete whitespace set (not only ' ') and do not be
    // fooled by whitespace sitting at offset 0
    $has_space = strcspn($string, $this->_whitespace) < strlen($string);
    if ($num_equal === 0 && !$has_space) {
        // bool attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // only one attribute
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        if ($key === '') {
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            return array();
        }
        // strict comparison: a value of "0" is not an empty value
        if ($quoted_value === '') return array($key => '');
        $first_char = $quoted_value[0];
        $last_char  = $quoted_value[strlen($quoted_value) - 1];

        $same_quote = ($first_char == $last_char);
        $open_quote = ($first_char == '"' || $first_char == "'");

        if ($same_quote && $open_quote) {
            // well behaved
            $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
        } else {
            // not well behaved
            if ($open_quote) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                $value = substr($quoted_value, 1);
            } else {
                $value = $quoted_value;
            }
        }
        // older PHP versions return false instead of '' from substr()
        if ($value === false) $value = '';
        // decode the value exactly like the general loop below does
        return array($key => $this->parseData($value));
    }

    // setup loop environment
    $array  = array(); // return assoc array of attributes
    $cursor = 0; // current position in string (moves forward)
    $size   = strlen($string); // size of the string (stays the same)

    // if we have unquoted attributes, the parser expects a terminating
    // space, so let's guarantee that there's always a terminating space.
    $string .= ' ';

    while (true) {

        if ($cursor >= $size) {
            break;
        }

        // skip whitespace in front of the key
        $cursor += strspn($string, $this->_whitespace, $cursor);

        // only whitespace was left: nothing more to parse. This also
        // keeps every offset used below inside the string.
        if ($cursor >= $size) {
            break;
        }

        // grab the key

        $key_begin = $cursor; // we're currently at the start of the key

        // scroll past all characters that are the key (not whitespace or =)
        $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

        $key_end = $cursor; // now at the end of the key

        $key = substr($string, $key_begin, $key_end - $key_begin);

        if ($key === '' || $key === false) {
            // the cursor sits on a stray '='
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            // step over the '=' itself (the "1 +") and whatever is glued
            // to it; without that step the cursor would not move when
            // whitespace follows the '=', looping forever
            $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
            continue; // empty key
        }

        // scroll past all whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            $array[$key] = $key;
            break;
        }

        // if the next character is an equal sign, we've got a regular
        // pair, otherwise, it's a bool attribute
        $first_char = $string[$cursor];

        if ($first_char == '=') {
            // key="value"

            $cursor++;
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // nothing but (optional) whitespace after the '='
            if ($cursor >= $size) {
                $array[$key] = '';
                break;
            }

            // we might be in front of a quote right now

            $char = $string[$cursor];

            if ($char == '"' || $char == "'") {
                // it's quoted, end bound is $char
                $cursor++;
                $value_begin = $cursor;
                $cursor = strpos($string, $char, $cursor);
                $value_end = $cursor;
            } else {
                // it's not quoted, end bound is whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $this->_whitespace, $cursor);
                $value_end = $cursor;
            }

            // we reached a premature end (closing quote not found)
            if ($cursor === false) {
                $cursor = $size;
                $value_end = $cursor;
            }

            $value = substr($string, $value_begin, $value_end - $value_begin);
            if ($value === false) $value = '';
            $array[$key] = $this->parseData($value);
            // step over the closing quote / terminating whitespace
            $cursor++;

        } else {
            // boolattr; $key is known to be non-empty at this point
            $array[$key] = $key;
        }
    }
    return $array;
}
466
467	}
468

Note: See TracBrowser for help on using the browser.

Context Navigation

root/afridex/plugins/Flutter/purifier_lib/HTMLPurifier/Lexer/DirectLex.php

Download in other formats: