Context Navigation

Encoder.php @ 23

Revision 21, 16.0 kB (checked in by admin, 18 years ago)

Line
1	<?php
2
/**
 * A UTF-8 specific character encoder that handles cleaning and transforming.
 *
 * Every method is static; the class is a namespace for encoding helpers and
 * cannot be instantiated.
 *
 * @note All functions in this class should be static.
 */
class HTMLPurifier_Encoder
{

    /**
     * Constructor throws fatal error if you attempt to instantiate class.
     *
     * It is private, so outside code cannot call it at all; the error is a
     * second line of defence for code inside the class.
     */
    private function __construct() {
        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
    }

    /**
     * Error-handler that mutes errors, alternative to shut-up operator.
     *
     * Installed with set_error_handler() around iconv()/utf8_*() calls so
     * that their notices and warnings are swallowed, then removed again with
     * restore_error_handler().
     */
    private static function muteErrorHandler() {}

    /**
     * Cleans a UTF-8 string for well-formedness and SGML validity.
     *
     * It will parse according to UTF-8 and return a valid UTF-8 string, with
     * non-SGML codepoints excluded.
     *
     * Three strategies are tried in order:
     *  1. If PCRE accepts the string as UTF-8, only the non-SGML characters
     *     are stripped (fast path).
     *  2. Otherwise, if iconv is available and $force_php is false, iconv
     *     drops the illegal bytes and the non-SGML characters are stripped.
     *  3. Otherwise - or when iconv() itself reports failure by returning
     *     false - a byte-level state machine written in PHP is used.
     *
     * @note Just for reference, the non-SGML code points are 0 to 31 and
     *       127 to 159, inclusive. However, we allow code points 9, 10
     *       and 13, which are the tab, line feed and carriage return
     *       respectively. 128 and above the code points map to multibyte
     *       UTF-8 representations.
     *
     * @note The PHP fallback additionally drops U+FEFF (byte order mark);
     *       the PCRE and iconv paths do not.
     *
     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
     *       LGPL license. Notes on what changed are inside, but in general,
     *       the original code transformed UTF-8 text into an array of integer
     *       Unicode codepoints. Understandably, transforming that back to
     *       a string would be somewhat expensive, so the function was modded
     *       to directly operate on the string.
     *
     * @param $str       String to clean.
     * @param $force_php When true, skip iconv and use the PHP state machine
     *                   for strings that PCRE rejects.
     * @return Cleaned string.
     */
    public static function cleanUTF8($str, $force_php = false) {

        // Lazily built translation table: every non-SGML character maps to
        // the empty string so that strtr() deletes it.
        static $non_sgml_chars = array();
        if (empty($non_sgml_chars)) {
            for ($i = 0; $i <= 31; $i++) {
                // non-SGML ASCII chars
                // save \r, \t and \n
                if ($i == 9 || $i == 13 || $i == 10) continue;
                $non_sgml_chars[chr($i)] = '';
            }
            for ($i = 127; $i <= 159; $i++) {
                $non_sgml_chars[self::unichr($i)] = '';
            }
        }

        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');

        // The /u modifier makes PCRE validate the subject as UTF-8, so a
        // successful match means the string is already well-formed.
        // This is an optimization: 99% of the time, this will be the case.
        if (preg_match('/^.{1}/us', $str)) {
            return strtr($str, $non_sgml_chars);
        }

        if ($iconv && !$force_php) {
            // do the shortcut way
            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
            $converted = iconv('UTF-8', 'UTF-8//IGNORE', $str);
            restore_error_handler();
            if ($converted !== false) {
                return strtr($converted, $non_sgml_chars);
            }
            // iconv() signalled failure. Passing false on to strtr() would
            // throw away the entire input, so use the PHP cleaner instead.
        }

        return self::cleanUTF8Fallback($str);
    }

    /**
     * Pure-PHP UTF-8 cleaner: walks the string byte by byte and copies only
     * well-formed, SGML-valid characters to the output.
     *
     * Dropped: ASCII control characters other than tab/LF/CR, DEL, code
     * points 128-159, U+FEFF, non-shortest-form sequences, 5- and 6-byte
     * sequences, surrogates, code points above 0x10FFFF, stray continuation
     * bytes and truncated sequences.
     *
     * @param $str String to clean.
     * @return Cleaned string.
     */
    private static function cleanUTF8Fallback($str) {
        $mState = 0; // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0; // cached Unicode character
        $mBytes = 1; // cached expected number of octets in the current sequence

        // original code involved an $out that was an array of Unicode
        // codepoints. Instead of having to convert back into UTF-8, we've
        // decided to directly append valid UTF-8 characters onto a string
        // $out once they're done. $char accumulates raw bytes, while $mUcs4
        // turns into the Unicode code point, so there's some redundancy.

        $out  = '';
        $char = '';

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $in = ord($str[$i]);
            $char .= $str[$i]; // append byte to char
            if (0 == $mState) {
                // When mState is zero we expect either a US-ASCII character
                // or a multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    if (($in <= 31 || $in == 127) &&
                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                    ) {
                        // control characters, remove
                    } else {
                        $out .= $char;
                    }
                    // reset
                    $char = '';
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & $in)) {
                    // First octet of 5 octet sequence.
                    //
                    // This is illegal because the encoded codepoint must be
                    // either:
                    // (a) not the shortest form or
                    // (b) outside the Unicode range of 0-0x10FFFF.
                    // Rather than trying to resynchronize, we will carry on
                    // until the end of the sequence and let the later error
                    // handling code catch it.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5
                    // octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Current octet is neither in the US-ASCII range nor a
                    // legal first octet of a multi-octet sequence.
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char   = '';
                }
            } else {
                // When mState is non-zero, we expect a continuation of the
                // multi-octet sequence
                if (0x80 == (0xC0 & $in)) {
                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    if (0 == --$mState) {
                        // End of the multi-octet sequence. mUcs4 now contains
                        // the final Unicode codepoint to be output

                        // Check for illegal sequences and codepoints.

                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters = illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)
                        ) {
                            // illegal sequence, drop it
                        } elseif (0xFEFF != $mUcs4 && // omit BOM
                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
                        ) {
                            $out .= $char;
                        }
                        // initialize UTF8 cache (reset)
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                        $char   = '';
                    }
                } else {
                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
                    // Incomplete multi-octet sequence: discard what has been
                    // accumulated so far and reset.
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char   = '';
                    // The byte that broke the sequence is not part of it; it
                    // may be an ASCII character or the start of a valid
                    // sequence. Step back so the next iteration examines it
                    // again from the initial state instead of losing it.
                    // With mState == 0 every branch consumes the byte, so
                    // this cannot loop forever.
                    $i--;
                }
            }
        }
        return $out;
    }

    // +----------+----------+----------+----------+
    // | 33222222 | 22221111 | 111111   |          |
    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
    // +----------+----------+----------+----------+
    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
    // +----------+----------+----------+----------+
    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
    // +----------+----------+----------+----------+

    /**
     * Translates a Unicode codepoint into its corresponding UTF-8 character.
     *
     * @note Based on Feyd's function at
     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
     *       which is in public domain.
     * @note While we're going to do code point parsing anyway, a good
     *       optimization would be to refuse to translate code points that
     *       are non-SGML characters. However, this could lead to duplication.
     * @note This is very similar to the unichr function in
     *       maintenance/generate-entity-file.php (although this is superior,
     *       due to its sanity checks).
     *
     * @param $code Integer code point.
     * @return UTF-8 byte string of one to four bytes, or '' when $code is
     *         negative, above 0x10FFFF, or a surrogate (0xD800-0xDFFF).
     */
    public static function unichr($code) {
        if ($code > 1114111 or $code < 0 or
            ($code >= 55296 and $code <= 57343)) {
            // bits are set outside the "valid" range as defined
            // by UNICODE 4.1.0
            return '';
        }

        // $x is the last byte, $y/$z/$w the preceding ones (0 = not used).
        $x = $y = $z = $w = 0;
        if ($code < 128) {
            // regular ASCII character
            $x = $code;
        } else {
            // set up bits for UTF-8
            $x = ($code & 63) | 128;
            if ($code < 2048) {
                $y = (($code & 2047) >> 6) | 192;
            } else {
                $y = (($code & 4032) >> 6) | 128;
                if ($code < 65536) {
                    $z = (($code >> 12) & 15) | 224;
                } else {
                    $z = (($code >> 12) & 63) | 128;
                    $w = (($code >> 18) & 7)  | 240;
                }
            }
        }
        // set up the actual character
        $ret = '';
        if ($w) $ret .= chr($w);
        if ($z) $ret .= chr($z);
        if ($y) $ret .= chr($y);
        $ret .= chr($x);

        return $ret;
    }

    /**
     * Converts a string to UTF-8 based on configuration.
     *
     * The source encoding is read from $config->get('Core', 'Encoding').
     * 'utf-8' returns the string untouched; otherwise iconv is used when it
     * exists and $config->get('Test', 'ForceNoIconv') is false, and
     * utf8_encode() is used for 'iso-8859-1' when iconv is not used. Any
     * other combination raises an E_USER_ERROR.
     *
     * @param $str     String to convert.
     * @param $config  Configuration object providing get($namespace, $key).
     * @param $context Unused by this method.
     * @return Converted string; the iconv() result is returned as-is.
     */
    public static function convertToUTF8($str, $config, $context) {
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
            $str = iconv($encoding, 'utf-8//IGNORE', $str);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
            $str = utf8_encode($str);
            restore_error_handler();
            return $str;
        }
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Converts a string from UTF-8 based on configuration.
     *
     * The target encoding is read from $config->get('Core', 'Encoding').
     * When $config->get('Core', 'EscapeNonASCIICharacters') is set, non-ASCII
     * characters are first turned into numeric entities. The conversion then
     * mirrors convertToUTF8(): iconv when usable, utf8_decode() for
     * 'iso-8859-1', E_USER_ERROR otherwise.
     *
     * @note Currently, this is a lossy conversion, with unexpressable
     *       characters being omitted.
     *
     * @param $str     UTF-8 string to convert.
     * @param $config  Configuration object providing get($namespace, $key).
     * @param $context Unused by this method.
     * @return Converted string; the iconv() result is returned as-is.
     */
    public static function convertFromUTF8($str, $config, $context) {
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        if ($config->get('Core', 'EscapeNonASCIICharacters')) {
            $str = self::convertToASCIIDumbLossless($str);
        }
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
            $str = iconv('utf-8', $encoding . '//IGNORE', $str);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
            $str = utf8_decode($str);
            restore_error_handler();
            return $str;
        }
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Lossless (character-wise) conversion of HTML to ASCII.
     *
     * ASCII bytes are copied through; every multi-byte UTF-8 character is
     * replaced by a decimal numeric entity such as "&#233;".
     *
     * @param $str UTF-8 string to be converted to ASCII
     * @returns ASCII encoded string with non-ASCII character entity-ized
     * @warning Adapted from MediaWiki, claiming fair use: this is a common
     *          algorithm. If you disagree with this license fudgery,
     *          implement it yourself.
     * @note Uses decimal numeric entities since they are best supported.
     * @note This is a DUMB function: it has no concept of keeping
     *       character entities that the projected character encoding
     *       can allow. We could possibly implement a smart version
     *       but that would require it to also know which Unicode
     *       codepoints the charset supported (not an easy task).
     * @note Similar in structure to cleanUTF8(), but it assumes that $str
     *       is well-formed UTF-8 and performs no validation.
     */
    public static function convertToASCIIDumbLossless($str) {
        $bytesleft = 0;  // continuation bytes still expected
        $result = '';
        $working = 0;    // code point being assembled
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $bytevalue = ord($str[$i]);
            if ($bytevalue <= 0x7F) { //0xxx xxxx
                $result .= chr($bytevalue);
                $bytesleft = 0;
            } elseif ($bytevalue <= 0xBF) { //10xx xxxx
                $working = $working << 6;
                $working += ($bytevalue & 0x3F);
                $bytesleft--;
                if ($bytesleft <= 0) {
                    $result .= "&#" . $working . ";";
                }
            } elseif ($bytevalue <= 0xDF) { //110x xxxx
                $working = $bytevalue & 0x1F;
                $bytesleft = 1;
            } elseif ($bytevalue <= 0xEF) { //1110 xxxx
                $working = $bytevalue & 0x0F;
                $bytesleft = 2;
            } else { //1111 0xxx
                $working = $bytevalue & 0x07;
                $bytesleft = 3;
            }
        }
        return $result;
    }

}
373

Note: See TracBrowser for help on using the browser.

Context Navigation

root/afridex/plugins/Flutter/purifier_lib/HTMLPurifier/Encoder.php @ 23

Download in other formats: