| [21] | 1 | <?php |
|---|
| 2 | |
|---|
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    /**
     * Creates the percent encoder that parse() uses to normalize its input.
     */
    public function __construct() {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. Boolean
     *         false is returned instead when preg_match() reports no match
     *         for the top-level expression.
     */
    public function parse($uri) {

        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([^:/?#"<>]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // Separate out parts. The odd-numbered groups include the delimiter
        // (e.g. "scheme:" or "?query"), so they are non-empty exactly when
        // the component is present, even if the component itself is ''.
        // empty() is used because trailing groups that did not take part in
        // the match are absent from $matches.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // Group 3 always takes part in a successful match, so use it
            // verbatim. Probing it with empty() would discard the host "0",
            // because PHP considers the string "0" to be empty.
            $host       = isset($matches[3]) ? $matches[3] : '';
            // Group 4 includes the ':' marker; a marker with no digits
            // after it casts to port 0.
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
|---|
| 69 | |
|---|