root/afridex/plugins/Flutter/purifier_lib/HTMLPurifier/Lexer.php @ 21

Revision 21, 9.9 kB (checked in by admin, 18 years ago)
Line 
1<?php
2
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Not be streamable, and
 *  -# Have the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param $config Instance of HTMLPurifier_Config. Anything else is
     *                treated as a (deprecated) prototype: an object is
     *                returned as-is, a string is used as the lexer name,
     *                and null falls back to auto-detection.
     * @return Concrete lexer, or null if the lexer name is unrecognized and
     *         the installed error handler lets execution continue.
     */
    public static function create($config) {

        $has_config = ($config instanceof HTMLPurifier_Config);

        if (!$has_config) {
            $lexer = $config;
            trigger_error("Passing a prototype to
              HTMLPurifier_Lexer::create() is deprecated, please instead
              use %Core.LexerImpl", E_USER_WARNING);
        } else {
            $lexer = $config->get('Core', 'LexerImpl');
        }

        // a ready-made lexer object is used verbatim
        if (is_object($lexer)) {
            return $lexer;
        }

        // nothing requested explicitly: pick an implementation ourselves.
        // The config is only handed over when we really have one, so a
        // null prototype no longer ends in a method call on null.
        if (is_null($lexer)) {
            $lexer = self::detectLexerName($has_config ? $config : null);
        }

        // instantiate recognized string names
        switch ($lexer) {
            case 'DOMLex':
                return new HTMLPurifier_Lexer_DOMLex();
            case 'DirectLex':
                return new HTMLPurifier_Lexer_DirectLex();
            case 'PH5P':
                return new HTMLPurifier_Lexer_PH5P();
            default:
                trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
        }

        // only reachable when an error handler swallowed the error above
        return null;
    }

    /**
     * Auto-detection algorithm used by create() when no lexer was named.
     *
     * @param $config Instance of HTMLPurifier_Config, or null when create()
     *                was not given a configuration object.
     * @return Lexer name understood by create(): 'DirectLex' or 'DOMLex'.
     */
    private static function detectLexerName($config) {

        if ($config !== null) {
            // once PHP DOM implements native line numbers, or we
            // hack out something using XSLT, remove this stipulation
            $line_numbers = $config->get('Core', 'MaintainLineNumbers');
            if (
                $line_numbers === true ||
                ($line_numbers === null && $config->get('Core', 'CollectErrors'))
            ) {
                return 'DirectLex';
            }
        }

        // check for DOM support, because, surprisingly enough,
        // it's *not* part of the core!
        return class_exists('DOMDocument') ? 'DOMLex' : 'DirectLex';
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor.
     *
     * Declared explicitly because relying on an implicitly created
     * (dynamic) property is deprecated as of PHP 8.2. It is public so that
     * it keeps the visibility the implicit property had.
     */
    public $_entity_parser;

    public function __construct() {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     */
    protected $_special_entity2str =
            array(
                    '&quot;' => '"',
                    '&amp;'  => '&',
                    '&lt;'   => '<',
                    '&gt;'   => '>',
                    '&#39;'  => "'",
                    '&#039;' => "'",
                    '&#x27;' => "'"
            );

    /**
     * Parses special entities into the proper characters.
     *
     * This function translates escaped versions of the special characters
     * into the correct ones. The common entities are handled with a plain
     * strtr() lookup; the entity parser is only invoked when ampersands
     * remain that the lookup table cannot account for.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param $string String character data to be parsed.
     * @return Parsed character data.
     */
    public function parseData($string) {

        // following functions require at least one character
        if ($string === '') return '';

        // subtracts amps that cannot possibly be escaped: those followed
        // by a space and one sitting at the very end of the string
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if (!$num_amp) return $string; // abort if no entities
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        // every remaining candidate amp is explained by a decoded &amp;
        if ($num_amp_2 <= $num_esc_amp) return $string;

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an error; the work is left to
     * the concrete lexer classes.
     *
     * @param $string String HTML.
     * @param $config Configuration object handed through by the caller.
     * @param $context Context object handed through by the caller.
     * @return HTMLPurifier_Token array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context) {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * The contents may be empty: an empty section is simply removed.
     * Requiring at least one character here would let the match run past
     * the empty section's own terminator and into a later CDATA section.
     *
     * @param $string HTML string to process.
     * @return HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string) {
        return preg_replace_callback(
            '/<!\[CDATA\[(.*?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     *
     * Handles the comment-wrapped form
     * <!--//--><![CDATA[//><!-- ... //--><!]]> and, like escapeCDATA(),
     * accepts empty contents.
     *
     * @param $string HTML string to process.
     * @return HTML with the commented CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string) {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.*?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() that does the work.
     *
     * @note This method is protected; the callback still resolves because
     *       preg_replace_callback() is invoked from inside this class.
     * @param $matches PCRE matches array, with index 0 the entire match
     *                 and 1 the inside of the CDATA section.
     * @return Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches) {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     *
     * @param $html String HTML to normalize.
     * @param $config Configuration object; %Core.ConvertDocumentToFragment
     *                and %HTML.Trusted are consulted.
     * @param $context Context object (not used by this implementation).
     * @return Normalized HTML string.
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context) {

        // extract body from document if applicable
        if ($config->get('Core', 'ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // normalize newlines to \n
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML', 'Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its first body element, or the input unchanged when no complete
     * <body>...</body> pair is present. An empty body yields an empty
     * string.
     *
     * @param $html String HTML fragment or document.
     * @return Contents of the body element, or $html itself.
     * @todo Consider making protected
     */
    public function extractBody($html) {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*?)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        }
        return $html;
    }

}
272
Note: See TracBrowser for help on using the browser.