<?php

/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 *
 * A "double newline" ("\n\n") inside a text token is treated as a paragraph
 * boundary. The injector reacts to text tokens (handleText) and to start /
 * empty element tokens (handleElement) and, where appropriate, replaces the
 * current token with an array of tokens that contains additional <p> start
 * and end tokens.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{

    /** Name of this injector. */
    public $name = 'AutoParagraph';

    /** Elements this injector needs in order to work. */
    public $needed = array('p');

    /**
     * Creates a new <p> start token.
     *
     * The token is flagged with the 'MakeWellFormed_TagClosedError' armor
     * entry (presumably so that MakeWellFormed does not report an error when
     * it closes this generated tag -- verify against MakeWellFormed).
     *
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart() {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Processes a text token, possibly replacing it with an array of tokens
     * that includes paragraph tags.
     *
     * @param $token Reference to the current text token; its `data` is read
     *               and the variable may be overwritten with an array of
     *               replacement tokens. Left untouched when no paragraphing
     *               applies.
     */
    public function handleText(&$token) {
        $text = $token->data;
        if (empty($this->currentNesting)) {
            if (!$this->allowsElement('p')) return;
            // case 1: we're in root node (and it allows paragraphs)
            $token = array($this->_pStart());
            $this->_splitText($text, $token);
        } elseif ($this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // case 2: we're in a paragraph; start with an empty result so
            // _splitText can tell that no start tag was added by us
            $token = array();
            $this->_splitText($text, $token);
        } elseif ($this->allowsElement('p')) {
            // case 3: we're in an element that allows paragraphs
            if (strpos($text, "\n\n") !== false) {
                // case 3.1: this text node has a double-newline
                $token = array($this->_pStart());
                $this->_splitText($text, $token);
            } else {
                // test if up-coming tokens are either block or have
                // a double newline in them
                $ok = false;
                $nesting = 0;
                for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
                    $current = $this->inputTokens[$i];
                    if ($current instanceof HTMLPurifier_Token_Start) {
                        if (!$this->_isInline($current)) {
                            // we haven't found a double-newline, and
                            // we've hit a block element, so don't paragraph
                            break;
                        }
                        $nesting++;
                    }
                    if ($current instanceof HTMLPurifier_Token_End) {
                        // an end tag we did not open ourselves closes the
                        // parent element: nothing further to look at
                        if ($nesting <= 0) break;
                        $nesting--;
                    }
                    if ($current instanceof HTMLPurifier_Token_Text) {
                        if (strpos($current->data, "\n\n") !== false) {
                            // found it!
                            $ok = true;
                            break;
                        }
                    }
                }
                if ($ok) {
                    // case 3.2: this text node is next to another node
                    // that will start a paragraph
                    $token = array($this->_pStart(), $token);
                }
            }
        }
    }

    /**
     * Processes a start/empty element token, possibly prepending a <p>
     * start token to it.
     *
     * @param $token Reference to the current element token; may be
     *               overwritten with array(<p> start token, original token).
     */
    public function handleElement(&$token) {
        // check if we're inside a tag already
        if (!empty($this->currentNesting)) {
            if ($this->allowsElement('p')) {
                // special case: we're in an element that allows paragraphs

                // this token is already paragraph, abort
                if ($token->name === 'p') return;

                // this token is a block level, abort
                if (!$this->_isInline($token)) return;

                // check if this token is adjacent to the parent token
                $prevIndex = $this->inputIndex - 1;
                $prev = isset($this->inputTokens[$prevIndex])
                    ? $this->inputTokens[$prevIndex]
                    : null;
                if (!($prev instanceof HTMLPurifier_Token_Start)) {
                    // not adjacent, we can abort early.
                    // add lead paragraph tag if the previous tag was an end
                    // paragraph (our token is known to be inline here).
                    // The instanceof test comes first so that `name` is only
                    // read from tag tokens.
                    if ($prev instanceof HTMLPurifier_Token_End && $prev->name === 'p') {
                        $token = array($this->_pStart(), $token);
                    }
                    return;
                }

                // this token is the first child of the element that allows
                // paragraph. We have to peek ahead and see whether or not
                // there is anything inside that suggests that a paragraph
                // will be needed
                $ok = false;
                // maintain a mini-nesting counter, this lets us bail out
                // early if possible
                $j = 1; // current nesting, one is due to parent (we recalculate current token)
                for ($i = $this->inputIndex; isset($this->inputTokens[$i]); $i++) {
                    $current = $this->inputTokens[$i];
                    if ($current instanceof HTMLPurifier_Token_Start) $j++;
                    if ($current instanceof HTMLPurifier_Token_End) $j--;
                    if ($current instanceof HTMLPurifier_Token_Text) {
                        if (strpos($current->data, "\n\n") !== false) {
                            $ok = true;
                            break;
                        }
                    }
                    // the parent element has been closed, stop looking
                    if ($j <= 0) break;
                }
                if ($ok) {
                    $token = array($this->_pStart(), $token);
                }
            }
            return;
        }

        // we're in the root node: only paragraph if it allows paragraphs
        // (same guard handleText applies to root-level text)
        if (!$this->allowsElement('p')) return;

        // check if the start tag counts as a "block" element
        if (!$this->_isInline($token)) return;

        // append a paragraph tag before the token
        $token = array($this->_pStart(), $token);
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     *
     * When $data contains no double-newline, a single text token holding
     * $data is appended and nothing else happens.
     *
     * @param $data   String text data that will be processed
     *                into paragraphs
     * @param $result Reference to array of tokens that the
     *                tags will be appended onto. An empty array on entry
     *                signals that the caller did not add a <p> start token
     *                (i.e. we are already inside a paragraph).
     */
    private function _splitText($data, &$result) {
        $raw_paragraphs = explode("\n\n", $data);

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // there were no double-newlines, abort quickly
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }

        // remove empty paragraphs
        $paragraphs  = array();
        $needs_start = false;
        $needs_end   = false;

        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
                continue;
            }
            if ($i === 0 && empty($result)) {
                // The empty result indicates that the AutoParagraph
                // injector did not add any start paragraph tokens.
                // The fact that the first paragraph is empty indicates
                // that there was a double-newline at the start of the
                // data.
                // Combined together, this means that we are in a paragraph,
                // and the newline means we should start a new one.
                $result[] = new HTMLPurifier_Token_End('p');
                // However, the start token should only be added if
                // there is more processing to be done (i.e. there are
                // real paragraphs in here). If there are none, the
                // next start paragraph tag will be handled by the
                // next run-around the injector
                $needs_start = true;
            } elseif ($i + 1 === $c) {
                // a double-paragraph at the end indicates that
                // there is an overriding need to start a new paragraph
                // for the next section. This has no effect until
                // we've processed all of the other paragraphs though
                $needs_end = true;
            }
        }

        // check if there are no "real" paragraphs to be processed
        if (empty($paragraphs)) {
            return;
        }

        // add a start tag if an end tag was added while processing
        // the raw paragraphs (that happens if there's a leading double
        // newline)
        if ($needs_start) $result[] = $this->_pStart();

        // append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = $this->_pStart();
        }

        // remove trailing start token, if one is needed, it will
        // be handled the next time this injector is called
        array_pop($result);

        // check the outside to determine whether or not the
        // end paragraph tag should be removed. It should be removed
        // unless the next non-whitespace token is a paragraph
        // or a block element.
        $remove_paragraph_end = true;

        if (!$needs_end) {
            // Start of the checks one after the current token's index
            // NOTE(review): there is no break after a start/empty tag is
            // seen, so a later start/empty tag reached before the loop
            // stops overrides the decision -- confirm this is intended.
            for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
                $current = $this->inputTokens[$i];
                if ($current instanceof HTMLPurifier_Token_Start || $current instanceof HTMLPurifier_Token_Empty) {
                    $remove_paragraph_end = $this->_isInline($current);
                }
                // check if we can abort early (whitespace means we carry-on!)
                if ($current instanceof HTMLPurifier_Token_Text && !$current->is_whitespace) break;
                // end tags will automatically be handled by MakeWellFormed,
                // so we don't have to worry about them
                if ($current instanceof HTMLPurifier_Token_End) break;
            }
        } else {
            // trailing double-newline: the last paragraph must stay closed
            $remove_paragraph_end = false;
        }

        // drop the final </p> when the paragraph should remain open
        if ($remove_paragraph_end) {
            array_pop($result);
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags)
     *
     * The check looks the token's name up in the child elements of the
     * 'p' element in the HTML definition.
     *
     * @param $token Token with a `name` property
     * @return bool
     */
    private function _isInline($token) {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

}