| 1 | <?php |
|---|
| 2 | |
|---|
| 3 | /** |
|---|
| 4 | * Forgivingly lexes HTML (SGML-style) markup into tokens. |
|---|
| 5 | * |
|---|
| 6 | * A lexer parses a string of SGML-style markup and converts it into |
|---|
| 7 | * corresponding tokens. It doesn't check for well-formedness, although its |
|---|
| 8 | * internal mechanism may make this automatic (such as the case of |
|---|
| 9 | * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose |
|---|
| 10 | * from. |
|---|
| 11 | * |
|---|
| 12 | * A lexer is HTML-oriented: it might work with XML, but it's not |
|---|
| 13 | * recommended, as we adhere to a subset of the specification for optimization |
|---|
| 14 | * reasons. This might change in the future. Also, most tokenizers are not |
|---|
| 15 | * expected to handle DTDs or PIs. |
|---|
| 16 | * |
|---|
| 17 | * This class should not be directly instantiated, but you may use create() to |
|---|
| 18 | * retrieve a default copy of the lexer. Being a supertype, this class |
|---|
| 19 | * does not actually define any implementation, but offers commonly used |
|---|
| 20 | * convenience functions for subclasses. |
|---|
| 21 | * |
|---|
| 22 | * @note The unit tests will instantiate this class for testing purposes, as |
|---|
| 23 | * many of the utility functions require a class to be instantiated. |
|---|
| 24 | * This means that, even though this class is not runnable, it will |
|---|
| 25 | * not be declared abstract. |
|---|
| 26 | * |
|---|
| 27 | * @par |
|---|
| 28 | * |
|---|
| 29 | * @note |
|---|
| 30 | * We use tokens rather than create a DOM representation because DOM would: |
|---|
| 31 | * |
|---|
| 32 | * @par |
|---|
| 33 | * -# Require more processing and memory to create, |
|---|
| 34 | * -# Is not streamable, and |
|---|
| 35 | * -# Has the entire document structure (html and body not needed). |
|---|
| 36 | * |
|---|
| 37 | * @par |
|---|
| 38 | * However, DOM is helpful in that it makes it easy to move around nodes |
|---|
| 39 | * without a lot of lookaheads to see when a tag is closed. This is a |
|---|
| 40 | * limitation of the token system and some workarounds would be nice. |
|---|
| 41 | */ |
|---|
class HTMLPurifier_Lexer
{

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves the lexer to use, acting as a factory.
     *
     * The implementation is read from %Core.LexerImpl. If that directive
     * holds an object, the object itself is returned. If it holds one of
     * the recognized names ('DOMLex', 'DirectLex', 'PH5P'), a new instance
     * of the matching class is returned. If it is null, the implementation
     * is auto-detected: DirectLex when line numbers are requested (or
     * errors are being collected and %Core.MaintainLineNumbers is unset),
     * otherwise DOMLex when the DOM extension is loaded, otherwise
     * DirectLex.
     *
     * @note The behavior of this method has changed: rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       Passing anything other than an HTMLPurifier_Config still works
     *       but raises an E_USER_WARNING.
     *
     * @param $config Instance of HTMLPurifier_Config
     * @return Concrete lexer. If the name is not recognized, an
     *         E_USER_ERROR is triggered and null is returned should the
     *         error handler allow execution to continue.
     */
    public static function create($config) {

        $has_config = ($config instanceof HTMLPurifier_Config);

        if ($has_config) {
            $lexer = $config->get('Core', 'LexerImpl');
        } else {
            // legacy calling convention: the argument is the prototype
            // (or lexer name) itself
            $lexer = $config;
            trigger_error(
                "Passing a prototype to\n" .
                "HTMLPurifier_Lexer::create() is deprecated, please instead\n" .
                "use %Core.LexerImpl",
                E_USER_WARNING
            );
        }

        if (is_object($lexer)) {
            return $lexer;
        }

        if ($lexer === null) {
            // only hand over a real configuration object; the legacy
            // argument cannot answer get() calls
            $lexer = self::detectLexerName($has_config ? $config : null);
        }

        // instantiate recognized string names
        switch ($lexer) {
            case 'DOMLex':
                return new HTMLPurifier_Lexer_DOMLex();
            case 'DirectLex':
                return new HTMLPurifier_Lexer_DirectLex();
            case 'PH5P':
                return new HTMLPurifier_Lexer_PH5P();
            default:
                trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
        }

        return null;
    }

    /**
     * Auto-detection algorithm used by create() when no lexer was named.
     *
     * @param $config Instance of HTMLPurifier_Config, or null when no
     *                configuration is available (legacy calls); in that
     *                case only the DOM availability check is performed.
     * @return Name of the lexer implementation: 'DOMLex' or 'DirectLex'.
     */
    private static function detectLexerName($config) {

        if ($config !== null) {
            // once PHP DOM implements native line numbers, or we
            // hack out something using XSLT, remove this stipulation
            $line_numbers = $config->get('Core', 'MaintainLineNumbers');
            if (
                $line_numbers === true ||
                ($line_numbers === null && $config->get('Core', 'CollectErrors'))
            ) {
                return 'DirectLex';
            }
        }

        // check for DOM support, because, surprisingly enough,
        // it's *not* part of the core!
        return class_exists('DOMDocument') ? 'DOMLex' : 'DirectLex';
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Instance of HTMLPurifier_EntityParser, created by the constructor
     * and used by parseData() and normalize().
     */
    protected $_entity_parser;

    /**
     * Sets up the entity parser. Subclasses that define their own
     * constructor must call this one, otherwise parseData() and
     * normalize() have no entity parser to work with.
     */
    public function __construct() {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     */
    protected $_special_entity2str =
            array(
                    '&quot;' => '"',
                    '&amp;'  => '&',
                    '&lt;'   => '<',
                    '&gt;'   => '>',
                    '&#39;'  => "'",
                    '&#039;' => "'",
                    '&#x27;' => "'"
            );

    /**
     * Parses special entities into the proper characters.
     *
     * This function translates escaped versions of the special characters
     * into the correct ones. The common spellings are handled with a fast
     * table lookup; the entity parser is only consulted when ampersands
     * that the table did not account for remain afterwards.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param $string String character data to be parsed.
     * @return Parsed character data.
     */
    public function parseData($string) {

        // following functions require at least one character
        if ($string === '') return '';

        $num_amp = self::countCandidateAmps($string);
        if (!$num_amp) return $string; // abort if no entities

        // every &amp; legitimately leaves one bare ampersand behind after
        // translation, so remember how many of those to expect
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // each table entry yields one character, so the string is still
        // non-empty here
        $num_amp_2 = self::countCandidateAmps($string);
        if ($num_amp_2 <= $num_esc_amp) return $string;

        // hmm... now we have some uncommon entities. Use the callback.
        return $this->_entity_parser->substituteSpecialEntities($string);
    }

    /**
     * Counts the ampersands that could possibly begin an entity, that is,
     * all ampersands except those followed by a space or ending the string.
     *
     * @param $string Non-empty string to inspect.
     * @return Number of candidate ampersands.
     */
    private static function countCandidateAmps($string) {
        $trailing = ($string[strlen($string) - 1] === '&') ? 1 : 0;
        return substr_count($string, '&') - substr_count($string, '& ') - $trailing;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; concrete
     * lexers are expected to override it.
     *
     * @param $string String HTML.
     * @param $config Configuration object (unused here).
     * @param $context Context object (unused here).
     * @return HTMLPurifier_Token array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context) {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * Only non-empty sections are matched; the /s modifier lets a section
     * span multiple lines.
     *
     * @param $string HTML string to process.
     * @return HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string) {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     *
     * Handles sections wrapped as
     * <!--//--><![CDATA[//><!-- ... //--><!]]> and escapes their contents
     * the same way escapeCDATA() does.
     *
     * @param $string HTML string to process.
     * @return HTML with commented CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string) {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This is an implementation detail of the two escape
     *          functions; calling it directly is not recommended.
     * @param $matches PCRE matches array, with index 0 the entire match
     *                 and 1 the inside of the CDATA section.
     * @return Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches) {
        // the character set is passed explicitly so the result does not
        // depend on PHP's configured default charset
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     *
     * @param $html String HTML to normalize.
     * @param $config Instance of HTMLPurifier_Config; the directives
     *                %Core.ConvertDocumentToFragment and %HTML.Trusted
     *                are consulted.
     * @param $context Context object (not used by this method).
     * @return Normalized HTML string.
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context) {

        // extract body from document if applicable
        if ($config->get('Core', 'ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // normalize newlines to \n (CRLF first, so that a pair does not
        // turn into two newlines)
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML', 'Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element.
     *
     * The match is case-insensitive, spans newlines, and ends at the first
     * closing body tag. When no non-empty body element is found, the input
     * is returned untouched.
     *
     * @param $html String HTML fragment or document.
     * @return Contents of the body element, or the original string.
     * @todo Consider making protected
     */
    public function extractBody($html) {
        $matches = array();
        if (preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches)) {
            return $matches[1];
        }
        return $html;
    }

}
|---|
| 272 | |
|---|