| 1 | <?php |
|---|
| 2 | |
|---|
/**
 * Class that handles operations involving percent-encoding in URIs.
 *
 * @warning
 *      Be careful when reusing instances of PercentEncoder. The object
 *      you use for normalize() SHOULD NOT be used for encode(), or
 *      vice-versa.
 */
class HTMLPurifier_PercentEncoder
{
    /**
     * Lookup table of bytes that must be left as-is.
     *
     * Keys are byte ordinals (as returned by ord()); every value is true.
     * Only the presence of a key matters: encode() never escapes such a
     * byte, and normalize() decodes an escape sequence that refers to one.
     */
    protected $preserve = array();

    /**
     * Build the table of bytes to preserve.
     *
     * The table always contains the ASCII digits, the ASCII letters and
     * the four marks "-", ".", "_" and "~". Every byte of $preserve, when
     * given, is added on top of that.
     *
     * @param $preserve String of extra characters that should be preserved,
     *                  or false (the default) to add none.
     */
    public function __construct($preserve = false)
    {
        // Insertion order is digits, upper-case, lower-case, then the marks.
        $unreserved = array_merge(
            range(48, 57),          // 0-9
            range(65, 90),          // A-Z
            range(97, 122),         // a-z
            array(45, 46, 95, 126)  // - . _ ~
        );
        foreach ($unreserved as $ordinal) {
            $this->preserve[$ordinal] = true;
        }

        if ($preserve === false) {
            return;
        }

        // Walk the string byte by byte; each byte becomes a key of the table.
        $total = strlen($preserve);
        for ($pos = 0; $pos < $total; $pos++) {
            $this->preserve[ord($preserve[$pos])] = true;
        }
    }

    /**
     * Replacement for urlencode(): percent-escapes every byte that is not
     * in the preserve table.
     *
     * @note
     *      Assumes that the string has already been normalized, making any
     *      and all percent escape sequences valid. A literal "%" is always
     *      copied through untouched, whether or not it is in $preserve.
     * @param $string String to be encoded
     * @return Encoded string.
     */
    public function encode($string)
    {
        $encoded = '';
        $total = strlen($string);
        for ($pos = 0; $pos < $total; $pos++) {
            $byte = $string[$pos];

            // Percent signs are never re-escaped.
            if ($byte === '%') {
                $encoded .= $byte;
                continue;
            }

            $ordinal = ord($byte);
            if (isset($this->preserve[$ordinal])) {
                $encoded .= $byte;
                continue;
            }

            // Two upper-case hex digits, zero padded.
            $encoded .= '%' . sprintf('%02X', $ordinal);
        }
        return $encoded;
    }

    /**
     * Fix up percent-encoding by decoding preserved characters and
     * normalizing the remaining escape sequences.
     *
     * A "%" that is not followed by two hexadecimal digits is rewritten
     * as "%25". A valid escape that refers to a byte in the preserve table
     * is replaced by that byte. Any other valid escape is kept, with its
     * hex digits upper-cased.
     *
     * @warning This function is affected by $preserve, even though the
     *      usual desired behavior is for this not to preserve those
     *      characters. Be careful when reusing instances of PercentEncoder!
     * @param $string String to normalize
     * @return Normalized string.
     */
    public function normalize($string)
    {
        if ($string == '') {
            return '';
        }

        // Everything before the first "%" is copied verbatim; each later
        // piece is the text that followed one "%" in the input.
        $pieces = explode('%', $string);
        $normalized = array_shift($pieces);

        foreach ($pieces as $piece) {
            $hex = substr($piece, 0, 2);

            // Too short or not hexadecimal: the "%" was a stray literal.
            if (strlen($piece) < 2 || !ctype_xdigit($hex)) {
                $normalized .= '%25' . $piece;
                continue;
            }

            $tail = substr($piece, 2);
            $ordinal = hexdec($hex);

            if (isset($this->preserve[$ordinal])) {
                // The escape was unnecessary, so emit the raw byte.
                $normalized .= chr($ordinal) . $tail;
            } else {
                $normalized .= '%' . strtoupper($hex) . $tail;
            }
        }
        return $normalized;
    }
}
|---|
| 97 | |
|---|