| 1 | <?php |
|---|
| 2 | |
|---|
/**
 * A UTF-8 specific character encoder that handles cleaning and transforming.
 * @note All functions in this class should be static.
 */
class HTMLPurifier_Encoder
{

    /**
     * Constructor throws fatal error if you attempt to instantiate class
     */
    private function __construct() {
        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
    }

    /**
     * Error-handler that mutes errors, alternative to shut-up operator.
     *
     * Installed temporarily with set_error_handler() around calls that may
     * raise notices or warnings; the body is intentionally empty.
     */
    private static function muteErrorHandler() {}

    /**
     * Calls iconv() while muteErrorHandler() is installed, then restores
     * the previous error handler.
     * @param $in_charset  Name of the encoding $str is currently in
     * @param $out_charset Name of the target encoding (may carry a suffix
     *                     such as //IGNORE)
     * @param $str         String to convert
     * @returns Whatever iconv() returned: the converted string, or false
     *          when the conversion failed
     */
    private static function mutedIconv($in_charset, $out_charset, $str) {
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        $result = iconv($in_charset, $out_charset, $str);
        restore_error_handler();
        return $result;
    }

    /**
     * Cleans a UTF-8 string for well-formedness and SGML validity
     *
     * It will parse according to UTF-8 and return a valid UTF8 string, with
     * non-SGML codepoints excluded.
     *
     * Three strategies are tried in order:
     *  1. If PCRE accepts the string as UTF-8, only the non-SGML code
     *     points are stripped.
     *  2. Otherwise, if iconv is available and $force_php is false, iconv
     *     is asked to drop the malformed bytes. Should iconv report failure
     *     (return false), the string is NOT thrown away; strategy 3 is used.
     *  3. A pure-PHP UTF-8 decoder walks the bytes and keeps only
     *     well-formed, permitted characters.
     *
     * @param $str       String to clean
     * @param $force_php When true, skip iconv and use the pure-PHP decoder
     *                   for strings that are not already valid UTF-8
     * @returns Cleaned UTF-8 string
     *
     * @note Just for reference, the non-SGML code points are 0 to 31 and
     *       127 to 159, inclusive.  However, we allow code points 9, 10
     *       and 13, which are the tab, line feed and carriage return
     *       respectively. 128 and above the code points map to multibyte
     *       UTF-8 representations.
     *
     * @note Only the pure-PHP decoder additionally drops U+FEFF (the byte
     *       order mark); the other two strategies leave it in place.
     */
    public static function cleanUTF8($str, $force_php = false) {

        // strtr() map built once per request: non-SGML character => ''
        static $non_sgml_chars = array();
        if (empty($non_sgml_chars)) {
            for ($i = 0; $i <= 31; $i++) {
                // non-SGML ASCII chars
                // save \t, \n and \r
                if ($i == 9 || $i == 10 || $i == 13) continue;
                $non_sgml_chars[chr($i)] = '';
            }
            for ($i = 127; $i <= 159; $i++) {
                $non_sgml_chars[self::unichr($i)] = '';
            }
        }

        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');

        // UTF-8 validity is checked since PHP 4.3.5
        // This is an optimization: if the string is already valid UTF-8, no
        // need to do iconv/php stuff. 99% of the time, this will be the case.
        if (preg_match('/^.{1}/us', $str)) {
            return strtr($str, $non_sgml_chars);
        }

        if ($iconv && !$force_php) {
            // do the shortcut way
            $converted = self::mutedIconv('UTF-8', 'UTF-8//IGNORE', $str);
            if ($converted !== false) {
                return strtr($converted, $non_sgml_chars);
            }
            // iconv gave up on the input; passing false on to strtr() would
            // turn the whole string into '', so use the PHP decoder instead
        }

        return self::cleanUTF8Php($str);
    }

    /**
     * Pure-PHP implementation of cleanUTF8(): decodes $str byte by byte and
     * copies only well-formed, permitted characters to the result.
     *
     * Dropped: ASCII control characters other than tab, line feed and
     * carriage return; code points 127 to 159; U+FEFF; non-shortest-form
     * sequences; sequences longer than four octets; surrogates; code points
     * above 0x10FFFF; and any byte that breaks off a multi-octet sequence
     * (that byte is discarded together with the partial sequence).
     *
     * @param $str String to clean
     * @returns Cleaned UTF-8 string
     *
     * @note Adapted from utf8ToUnicode by Henri Sivonen (hsivonen@iki.fi),
     *       <http://iki.fi/hsivonen/php-utf8/>, under the LGPL license.
     *       The original code transformed UTF-8 text into an array of
     *       integer Unicode codepoints; transforming that back to a string
     *       would be somewhat expensive, so the function was modded to
     *       directly operate on the string.
     */
    private static function cleanUTF8Php($str) {
        $mState = 0; // expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0; // Unicode code point assembled so far
        $mBytes = 1; // expected number of octets in the current sequence

        // $char accumulates the raw bytes of the current character while
        // $mUcs4 turns into its code point, so there's some redundancy;
        // $out receives $char once the character is known to be acceptable.
        $out  = '';
        $char = '';

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square-bracket offsets: the {} form no longer parses on PHP 8
            $byte  = $str[$i];
            $in    = ord($byte);
            $char .= $byte; // append byte to char
            if (0 == $mState) {
                // When mState is zero we expect either a US-ASCII character
                // or a multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII: pass through unless it is a control
                    // character other than \t, \n or \r
                    $is_control = ($in <= 31 || $in == 127) &&
                        !($in == 9 || $in == 10 || $in == 13);
                    if (!$is_control) {
                        $out .= $char;
                    }
                    // reset
                    $char = '';
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & $in)) {
                    // First octet of 5 octet sequence.
                    //
                    // This is illegal because the encoded codepoint must be
                    // either:
                    // (a) not the shortest form or
                    // (b) outside the Unicode range of 0-0x10FFFF.
                    // Rather than trying to resynchronize, we will carry on
                    // until the end of the sequence and let the later error
                    // handling code catch it.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5
                    // octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Current octet is neither in the US-ASCII range nor a
                    // legal first octet of a multi-octet sequence.
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char   = '';
                }
            } else {
                // When mState is non-zero, we expect a continuation of the
                // multi-octet sequence
                if (0x80 == (0xC0 & $in)) {
                    // Legal continuation: merge its six payload bits
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    if (0 == --$mState) {
                        // End of the multi-octet sequence. mUcs4 now contains
                        // the final Unicode codepoint to be output

                        // Check for illegal sequences and codepoints.
                        $illegal =
                            // From Unicode 3.1, non-shortest form is illegal
                            ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters = illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF);

                        if (!$illegal &&
                            0xFEFF != $mUcs4 && // omit BOM
                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
                        ) {
                            $out .= $char;
                        }
                        // initialize UTF8 cache (reset)
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                        $char   = '';
                    }
                } else {
                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
                    // Incomplete multi-octet sequence.
                    // used to result in complete fail, but we'll reset
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char   = '';
                }
            }
        }
        return $out;
    }

    /**
     * Translates a Unicode codepoint into its corresponding UTF-8 character.
     * @param $code Integer Unicode code point
     * @returns UTF-8 encoded character, or an empty string when $code is
     *          negative, above 0x10FFFF, or a surrogate (0xD800..0xDFFF)
     * @note Based on Feyd's function at
     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
     *       which is in public domain.
     * @note While we're going to do code point parsing anyway, a good
     *       optimization would be to refuse to translate code points that
     *       are non-SGML characters.  However, this could lead to duplication.
     * @note This is very similar to the unichr function in
     *       maintenance/generate-entity-file.php (although this is superior,
     *       due to its sanity checks).
     */

    // +----------+----------+----------+----------+
    // | 33222222 | 22221111 | 111111   |          |
    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
    // +----------+----------+----------+----------+
    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
    // +----------+----------+----------+----------+
    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
    // +----------+----------+----------+----------+

    public static function unichr($code) {
        if ($code > 1114111 || $code < 0 ||
            ($code >= 55296 && $code <= 57343)) {
            // bits are set outside the "valid" range as defined
            // by UNICODE 4.1.0
            return '';
        }

        if ($code < 128) {
            // regular ASCII character
            return chr($code);
        }

        // trailing octet is shared by every multi-octet form
        $x = ($code & 63) | 128;
        if ($code < 2048) {
            // 2 octets: 110yyyyy 10xxxxxx
            $y = (($code & 2047) >> 6) | 192;
            return chr($y) . chr($x);
        }

        $y = (($code & 4032) >> 6) | 128;
        if ($code < 65536) {
            // 3 octets: 1110zzzz 10yyyyyy 10xxxxxx
            $z = (($code >> 12) & 15) | 224;
            return chr($z) . chr($y) . chr($x);
        }

        // 4 octets: 11110www 10wwzzzz 10yyyyyy 10xxxxxx
        $z = (($code >> 12) & 63) | 128;
        $w = (($code >> 18) & 7) | 240;
        return chr($w) . chr($z) . chr($y) . chr($x);
    }

    /**
     * Converts a string to UTF-8 based on configuration.
     * @param $str     String in the encoding named by Core.Encoding
     * @param $config  Configuration object; read through
     *                 get('Core', 'Encoding') and get('Test', 'ForceNoIconv')
     * @param $context Context object (not used by this method)
     * @returns $str unchanged when the encoding is 'utf-8'; otherwise the
     *          result of iconv() (false if iconv fails) or, when iconv is
     *          unavailable or disabled and the encoding is 'iso-8859-1',
     *          the result of utf8_encode()
     * @note Any other situation raises an E_USER_ERROR
     *       'Encoding not supported'.
     */
    public static function convertToUTF8($str, $config, $context) {
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            return self::mutedIconv($encoding, 'utf-8//IGNORE', $str);
        } elseif ($encoding === 'iso-8859-1') {
            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
            $str = utf8_encode($str);
            restore_error_handler();
            return $str;
        }
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Converts a string from UTF-8 based on configuration.
     * @param $str     UTF-8 string
     * @param $config  Configuration object; read through
     *                 get('Core', 'Encoding'),
     *                 get('Core', 'EscapeNonASCIICharacters') and
     *                 get('Test', 'ForceNoIconv')
     * @param $context Context object (not used by this method)
     * @returns $str unchanged when the encoding is 'utf-8'; otherwise the
     *          result of iconv() (false if iconv fails) or, when iconv is
     *          unavailable or disabled and the encoding is 'iso-8859-1',
     *          the result of utf8_decode()
     * @note Currently, this is a lossy conversion, with unexpressable
     *       characters being omitted.
     * @note When Core.EscapeNonASCIICharacters is set, non-ASCII characters
     *       are first replaced by numeric entities, see
     *       convertToASCIIDumbLossless().
     * @note Any other situation raises an E_USER_ERROR
     *       'Encoding not supported'.
     */
    public static function convertFromUTF8($str, $config, $context) {
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        if ($config->get('Core', 'EscapeNonASCIICharacters')) {
            $str = self::convertToASCIIDumbLossless($str);
        }
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            return self::mutedIconv('utf-8', $encoding . '//IGNORE', $str);
        } elseif ($encoding === 'iso-8859-1') {
            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
            $str = utf8_decode($str);
            restore_error_handler();
            return $str;
        }
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Lossless (character-wise) conversion of HTML to ASCII
     * @param $str UTF-8 string to be converted to ASCII
     * @returns ASCII encoded string with non-ASCII character entity-ized
     * @warning Adapted from MediaWiki, claiming fair use: this is a common
     *          algorithm. If you disagree with this license fudgery,
     *          implement it yourself.
     * @note Uses decimal numeric entities since they are best supported.
     * @note This is a DUMB function: it has no concept of keeping
     *       character entities that the projected character encoding
     *       can allow. We could possibly implement a smart version
     *       but that would require it to also know which Unicode
     *       codepoints the charset supported (not an easy task).
     * @note Decodes UTF-8 much like cleanUTF8() does, but performs no
     *       validation: it assumes that $str is well-formed UTF-8.
     */
    public static function convertToASCIIDumbLossless($str) {
        $bytesleft = 0; // continuation octets still expected
        $result = '';
        $working = 0;   // code point being assembled
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $bytevalue = ord($str[$i]);
            if ($bytevalue <= 0x7F) { //0xxx xxxx
                $result .= chr($bytevalue);
                $bytesleft = 0;
            } elseif ($bytevalue <= 0xBF) { //10xx xxxx
                $working = $working << 6;
                $working += ($bytevalue & 0x3F);
                $bytesleft--;
                if ($bytesleft <= 0) {
                    // last octet of the character: emit decimal entity
                    $result .= "&#" . $working . ";";
                }
            } elseif ($bytevalue <= 0xDF) { //110x xxxx
                $working = $bytevalue & 0x1F;
                $bytesleft = 1;
            } elseif ($bytevalue <= 0xEF) { //1110 xxxx
                $working = $bytevalue & 0x0F;
                $bytesleft = 2;
            } else { //1111 0xxx
                $working = $bytevalue & 0x07;
                $bytesleft = 3;
            }
        }
        return $result;
    }


}
|---|
| 373 | |
|---|