author | Dan |
Thu, 28 Jun 2007 15:26:40 -0400 | |
changeset 31 | dc8741857bde |
parent 24 | 9ecc94c4c7f5 |
child 73 | 0a74676a2f2f |
permissions | -rw-r--r-- |
1 | 1 |
<?php |
2 |
||
3 |
/** |
|
4 |
* Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between |
|
21
663fcf528726
Updated all version numbers back to Banshee; a few preliminary steps towards full UTF-8 support in page URLs
Dan
parents:
16
diff
changeset
|
5 |
* Version 1.0 (Banshee) |
1 | 6 |
* Copyright (C) 2006-2007 Dan Fuhry |
7 |
* |
|
8 |
* This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License |
|
9 |
* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. |
|
10 |
* |
|
11 |
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied |
|
12 |
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details. |
|
13 |
* |
|
14 |
* This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under |
|
15 |
* the GPLv2; see the file GPL included with this package for details. |
|
16 |
* |
|
17 |
* We're using the MW parser because the Text_Wiki version simply refused to work under PHP 5.2.0. Porting this was |
|
18 |
* _not_ easy. <leaves to get cup of coffee> |
|
19 |
*/ |
|
20 |
||
21 |
// Shared parser state: the strip-marker table and the random marker prefix.
global $mStripState, $wgRandomKey;
$mStripState = array();

// Regex fragments: one attribute-name character and one whitespace character.
$attrib = '[a-zA-Z0-9]';
$space = '[\x09\x0a\x0d\x20]';

// Character references: named (group 1), decimal (2), hex written with "x" (3)
// or "X" (4), or a lone ampersand (5). Extended mode: layout whitespace is ignored.
define( 'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Za-z]+);
     |&\#X([0-9A-Za-z]+);
     |(&)/x' );

// One attribute of a tag: name (group 1) and an optional "=value" part (2) whose
// value is double-quoted (3), single-quoted (4), bare (5) or an unquoted #colour (6).
define( 'MW_ATTRIBS_REGEX',
    "/(?:^|$space)($attrib+)
      ($space*=$space*
        (?:
         # The attribute value: quoted or alone
          \"([^<\"]*)\"
         | '([^<']*)'
         |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
         |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                             # colors are specified like this.
                             # We'll be normalizing it.
        )
       )?(?=$space|\$)/sx" );
|
47 |
||
48 |
/**
 * Entry point for wiki-table rendering.
 *
 * Hands the text to doTableStuff() and returns the result. The strip/unstrip
 * passes that normally surround this step are disabled here, so the global
 * strip state is brought into scope but not modified by this function.
 *
 * @param string $text the text to parse
 * @return string
 * @access public
 */
function process_tables( $text )
{
    global $mStripState;
    // Alias kept for the (currently disabled) strip pass; nothing reads it yet.
    $stripState =& $mStripState;

    return doTableStuff( $text );
}
|
72 |
||
73 |
/**
 * Parse the wiki syntax used to render tables.
 *
 * Works line by line. "{|" opens a table, "|}" closes it, "|-" starts a new
 * row, "|" / "!" start data / header cells and "|+" a caption. Several cells
 * may share one line when separated by "||" (or "!!" on header lines).
 * Generated HTML tags are wrapped in <nowiki> so later passes leave them alone.
 *
 * One stack entry per currently open (possibly nested) table is kept in each
 * of $td, $ltd, $tr, $ltr and $has_opened_tr.
 *
 * @param string $t the text to parse
 * @return string
 * @access private
 */
function doTableStuff( $t ) {
    $t = explode( "\n", $t );
    $td = array();            # Is currently a td tag open?
    $ltd = array();           # Was it TD or TH?
    $tr = array();            # Is currently a tr tag open?
    $ltr = array();           # tr attributes
    $has_opened_tr = array(); # Did this table open a <tr> element?
    $indent_level = 0;        # indent level of the table

    foreach ( $t as $k => $x ) {
        $x = trim( $x );
        $fc = substr( $x, 0, 1 );
        if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) ) {
            # Table start; leading colons give the indent level
            $indent_level = strlen( $matches[1] );

            $attributes = unstripForHTML( $matches[2] );

            $t[$k] = str_repeat( '<dl><dd>', $indent_level ) .
                '<nowiki><table' . fixTagAttributes( $attributes, 'table' ) . '></nowiki>';
            array_push( $td, false );
            array_push( $ltd, '' );
            array_push( $tr, false );
            array_push( $ltr, '' );
            array_push( $has_opened_tr, false );
        }
        else if ( count( $td ) == 0 ) { } # Not inside a table: don't do any of the following
        else if ( '|}' == substr( $x, 0, 2 ) ) {
            # Table end: close whatever is still open, innermost first
            $z = "<nowiki></table></nowiki>" . substr( $x, 2 );
            $l = array_pop( $ltd );
            if ( !array_pop( $has_opened_tr ) ) $z = "<nowiki><tr><td></td></tr></nowiki>" . $z;
            if ( array_pop( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z;
            if ( array_pop( $td ) ) $z = '<nowiki></' . $l . '></nowiki>' . $z;
            array_pop( $ltr );
            $t[$k] = $z . str_repeat( '<nowiki></dd></dl></nowiki>', $indent_level );
        }
        else if ( '|-' == substr( $x, 0, 2 ) ) { # Allows for |---------------
            $x = substr( $x, 1 );
            while ( $x != '' && substr( $x, 0, 1 ) == '-' ) $x = substr( $x, 1 );
            $z = '';
            $l = array_pop( $ltd );
            array_pop( $has_opened_tr );
            array_push( $has_opened_tr, true );
            if ( array_pop( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z;
            if ( array_pop( $td ) ) $z = '<nowiki></' . $l . '></nowiki>' . $z;
            array_pop( $ltr );
            $t[$k] = $z;
            array_push( $tr, false );
            array_push( $td, false );
            array_push( $ltd, '' );
            # Whatever follows the dashes is the attribute list of the next <tr>
            $attributes = unstripForHTML( $x );
            array_push( $ltr, fixTagAttributes( $attributes, 'tr' ) );
        }
        else if ( '|' == $fc || '!' == $fc || '|+' == substr( $x, 0, 2 ) ) { # Caption
            # $x is a table row
            if ( '|+' == substr( $x, 0, 2 ) ) {
                $fc = '+';
                $x = substr( $x, 1 );
            }
            $after = substr( $x, 1 );
            if ( $fc == '!' ) $after = str_replace( '!!', '||', $after );

            // Split up multiple cells on the same line.
            // FIXME: This can result in improper nesting of tags processed
            // by earlier parser steps, but should avoid splitting up eg
            // attribute values containing literal "||".
            $after = wfExplodeMarkup( '||', $after );

            $t[$k] = '';

            # Loop through each table cell
            foreach ( $after as $theline ) {
                $z = '';
                if ( $fc != '+' ) {
                    # Cells (unlike captions) need an open row
                    $tra = array_pop( $ltr );
                    if ( !array_pop( $tr ) ) $z = '<nowiki><tr' . $tra . "></nowiki>\n";
                    array_push( $tr, true );
                    array_push( $ltr, '' );
                    array_pop( $has_opened_tr );
                    array_push( $has_opened_tr, true );
                }

                # Close the previous cell, then remember what kind this one is
                $l = array_pop( $ltd );
                if ( array_pop( $td ) ) $z = '<nowiki></' . $l . '></nowiki>' . $z;
                if ( $fc == '|' ) $l = 'td';
                else if ( $fc == '!' ) $l = 'th';
                else if ( $fc == '+' ) $l = 'caption';
                else $l = '';
                array_push( $ltd, $l );

                # Cell parameters
                $y = explode( '|', $theline, 2 );
                # Note that a '|' inside an invalid link should not
                # be mistaken as delimiting cell parameters
                if ( strpos( $y[0], '[[' ) !== false ) {
                    $y = array( $theline );
                }
                if ( count( $y ) == 1 ) {
                    $y = "{$z}<nowiki><{$l}></nowiki>{$y[0]}";
                } else {
                    $attributes = unstripForHTML( $y[0] );
                    $y = "{$z}<nowiki><{$l}" . fixTagAttributes( $attributes, $l ) . "></nowiki>{$y[1]}";
                }
                $t[$k] .= $y;
                array_push( $td, true );
            }
        }
    }

    # Closing open td, tr && table
    while ( count( $td ) > 0 ) {
        $l = array_pop( $ltd );
        # Close the cell with its own tag name: an open cell may be a <th> or a
        # <caption>, not only a <td>. $l is never empty while a cell is open.
        if ( array_pop( $td ) ) $t[] = '<nowiki></' . $l . '></nowiki>';
        if ( array_pop( $tr ) ) $t[] = '<nowiki></tr></nowiki>';
        if ( !array_pop( $has_opened_tr ) ) $t[] = "<nowiki><tr><td></td></tr></nowiki>";
        $t[] = '<nowiki></table></nowiki>';
    }

    $t = implode( "\n", $t );

    # special case: don't return empty table
    if ( $t == "<nowiki><table></nowiki>\n<nowiki><tr><td></td></tr></nowiki>\n<nowiki></table></nowiki>" )
        $t = '';
    return $t;
}
|
208 |
||
209 |
/**
 * Normalize a tag-soup attribute fragment into well-formed attribute markup.
 *
 * The fragment is parsed by decodeTagAttributes(), filtered for the given
 * element by validateTagAttributes(), and every surviving pair is written
 * back as name="value", with the name passed through htmlspecialchars() and
 * the value through safeEncodeAttribute().
 *
 * Each emitted attribute is preceded by one space, so the result can be
 * appended directly after the tag name. Blank input, or input with no
 * surviving attribute, yields an empty string.
 *
 * @param string $text
 * @param string $element
 * @return string
 */
function fixTagAttributes( $text, $element ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    $decoded = decodeTagAttributes( $text );
    $allowed = validateTagAttributes( $decoded, $element );

    $rendered = '';
    foreach ( $allowed as $name => $val ) {
        $rendered .= ' ' . htmlspecialchars( $name ) . '="' . safeEncodeAttribute( $val ) . '"';
    }
    return $rendered;
}
|
245 |
||
246 |
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * After encodeAttribute() has run, characters and tokens that later wikitext
 * passes could still act on (braces, brackets, doubled apostrophes, pipes,
 * double underscores and the magic words ISBN / RFC / PMID) are replaced by
 * character references so that they render the same but are no longer
 * recognised as markup.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function safeEncodeAttribute( $text ) {
    $encValue = encodeAttribute( $text );

    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $encValue = strtr( $encValue, array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    ) );

    return $encValue;
}
|
273 |
||
274 |
/**
 * Encode an attribute value for HTML output.
 *
 * Runs the value through htmlspecialchars() and then turns newline, carriage
 * return and tab into numeric character references.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
    $encValue = htmlspecialchars( $text );

    // Whitespace is normalized during attribute decoding,
    // so if we've been passed non-spaces we must encode them
    // ahead of time or they won't be preserved.
    $encValue = strtr( $encValue, array(
        "\n" => '&#10;',
        "\r" => '&#13;',
        "\t" => '&#9;',
    ) );

    return $encValue;
}
|
293 |
||
294 |
/**
 * Restore all stripped content in a fragment using the global strip state:
 * first the general markers via unstrip(), then the nowiki markers via
 * unstripNoWiki().
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text ) {
    global $mStripState;

    $general = unstrip( $text, $mStripState );
    return unstripNoWiki( $general, $mStripState );
}
|
300 |
||
301 |
/**
 * Put back the content recorded under the 'nowiki' key of the strip state.
 * Text is returned untouched when the state has no such key.
 *
 * Always call this after unstrip() to preserve the order
 *
 * @param string $text
 * @param array $state strip state (marker => replacement maps)
 * @return string
 * @private
 */
function unstripNoWiki( $text, &$state ) {
    # TODO: good candidate for FSS
    return isset( $state['nowiki'] )
        ? strtr( $text, $state['nowiki'] )
        : $text;
}
|
316 |
||
317 |
/**
 * Filter a decoded attribute map for one element type.
 *
 * - Attributes missing from the element's whitelist are dropped
 * - A 'style' value is run through checkCss(); the attribute is dropped
 *   when that returns false
 * - An 'id' value is run through escapeId()
 *
 * @param array $attribs attribute name => value
 * @param string $element
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
    $allowed = array_flip( attributeWhitelist( $element ) );
    $clean = array();

    foreach ( $attribs as $name => $val ) {
        if ( !isset( $allowed[$name] ) ) {
            continue;
        }

        if ( $name == 'style' ) {
            # Strip javascript "expression" from stylesheets.
            # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
            $val = checkCss( $val );
            if ( $val === false ) {
                # haxx0r
                continue;
            }
        }

        if ( $name === 'id' ) {
            $val = escapeId( $val );
        }

        // A later occurrence of the same name overrides an earlier one, so the
        // output holds at most one attribute of each name.
        $clean[$name] = $val;
    }

    return $clean;
}
|
357 |
||
358 |
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * The returned string is the input with character references decoded and CSS
 * comments removed. For the safety check only, CSS hex escapes are decoded
 * and remaining backslashes dropped. The value is rejected when the result
 * contains 'expression', a 'url(' reference, or "tp://" / "tps://" (which
 * covers http, https and ftp URLs).
 *
 * @param string $value
 * @return mixed string on success, false when the value is rejected
 */
function checkCss( $value ) {
    $stripped = decodeCharReferences( $value );

    // Remove any comments; IE gets token splitting wrong
    $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
    $value = $stripped;

    // ... and continue checks
    // Decode hex escapes with a callback: the /e modifier this used to rely
    // on was removed in PHP 7.
    $stripped = preg_replace_callback(
        '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
        'checkCssDecodeEscape',
        $stripped );
    $stripped = str_replace( '\\', '', $stripped );
    if ( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', $stripped ) ) {
        # haxx0r
        return false;
    }

    return $value;
}

/**
 * Helper for checkCss(): turn one CSS hex escape match into UTF-8.
 *
 * Six hex digits can exceed the Unicode range, and codepointToUtf8() aborts
 * the request on such input, so those escapes become U+FFFD instead.
 *
 * @param array $matches match array; index 1 holds the hex digits
 * @return string
 * @access private
 */
function checkCssDecodeEscape( $matches ) {
    $codepoint = hexdec( $matches[1] );
    if ( $codepoint > 0x10ffff ) {
        return "\xEF\xBF\xBD";
    }
    return codepointToUtf8( $codepoint );
}
|
386 |
||
387 |
/**
 * Replace every match of MW_CHAR_REFS_REGEX in the text with whatever
 * decodeCharReferencesCallback() returns for it, i.e. decode numeric and
 * named character references.
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text ) {
    $callback = 'decodeCharReferencesCallback';
    return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
|
402 |
||
403 |
/**
 * Look up the acceptable attribute names for an element.
 *
 * The full table is built once by setupAttributeWhitelist() and cached in a
 * static variable. Unknown elements get an empty list.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element ) {
    static $list;
    if ( $list === null ) {
        $list = setupAttributeWhitelist();
    }

    if ( !isset( $list[$element] ) ) {
        return array();
    }
    return $list[$element];
}
|
419 |
||
420 |
/**
 * Build the table of permitted attributes, keyed by element name.
 *
 * Section numbers in the comments refer to the HTML 4.01 standard
 * (http://www.w3.org/TR/html4/). Elements are inserted in the same order as
 * the sections appear there.
 *
 * @return array element name => list of attribute names
 */
function setupAttributeWhitelist() {
    $core = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $blockLevel = array_merge( $core, array( 'align' ) );
    $cellAlign = array( 'align', 'char', 'charoff', 'valign' );
    $cellOnly = array( 'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
        'nowrap', 'width', 'height', 'bgcolor' ); # the last four are deprecated

    // Lists shared by several elements
    $edit = array_merge( $core, array( 'cite', 'datetime' ) );
    $rowGroup = array_merge( $core, $cellAlign );
    $column = array_merge( $core, array( 'span', 'width' ), $cellAlign );
    $cell = array_merge( $core, $cellOnly, $cellAlign );

    $map = array();

    # 7.5.4
    $map['div'] = $blockLevel;
    $map['center'] = $core; # deprecated
    $map['span'] = $blockLevel; # ??

    # 7.5.5
    foreach ( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $tag ) {
        $map[$tag] = $blockLevel;
    }

    # 9.2.1 (dfn, samp, kbd, abbr and acronym are not listed here)
    foreach ( array( 'em', 'strong', 'cite', 'code', 'var' ) as $tag ) {
        $map[$tag] = $core;
    }

    # 9.2.2
    $map['blockquote'] = array_merge( $core, array( 'cite' ) );

    # 9.2.3
    $map['sub'] = $core;
    $map['sup'] = $core;

    # 9.3.1
    $map['p'] = $blockLevel;

    # 9.3.2
    $map['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

    # 9.3.4
    $map['pre'] = array_merge( $core, array( 'width' ) );

    # 9.4
    $map['ins'] = $edit;
    $map['del'] = $edit;

    # 10.2
    $map['ul'] = array_merge( $core, array( 'type' ) );
    $map['ol'] = array_merge( $core, array( 'type', 'start' ) );
    $map['li'] = array_merge( $core, array( 'type', 'value' ) );

    # 10.3
    foreach ( array( 'dl', 'dd', 'dt' ) as $tag ) {
        $map[$tag] = $core;
    }

    # 11.2.1
    $map['table'] = array_merge( $core, array(
        'summary', 'width', 'border', 'frame',
        'rules', 'cellspacing', 'cellpadding',
        'align', 'bgcolor',
    ) );

    # 11.2.2
    $map['caption'] = $blockLevel;

    # 11.2.3
    foreach ( array( 'thead', 'tfoot', 'tbody' ) as $tag ) {
        $map[$tag] = $rowGroup;
    }

    # 11.2.4
    $map['colgroup'] = $column;
    $map['col'] = $column;

    # 11.2.5
    $map['tr'] = array_merge( $core, array( 'bgcolor' ), $cellAlign );

    # 11.2.6
    $map['td'] = $cell;
    $map['th'] = $cell;

    # 12.2
    # added by dan
    $map['a'] = array_merge( $core, array( 'href', 'name' ) );

    # 13.2
    # added by dan
    $map['img'] = array_merge( $core, array( 'src', 'width', 'height', 'alt' ) );

    # 15.2.1
    foreach ( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $tag ) {
        $map[$tag] = $core;
    }

    # 15.2.2 (basefont is not listed)
    $map['font'] = array_merge( $core, array( 'size', 'color', 'face' ) );

    # 15.3
    $map['hr'] = array_merge( $core, array( 'noshade', 'size', 'width' ) );

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    foreach ( array( 'ruby', 'rb', 'rt', 'rp' ) as $tag ) {
        $map[$tag] = $core;
    }

    # For compatibility with the XHTML parser.
    foreach ( array( 'nowiki', 'noinclude', 'nodisplay' ) as $tag ) {
        $map[$tag] = array();
    }

    # XHTML stuff
    $map['acronym'] = $core;

    return $map;
}
|
575 |
||
576 |
/**
 * Escape a value so that it can be used in an id attribute. This does not
 * validate the value (see first link).
 *
 * Spaces become underscores, character references are decoded, the result is
 * URL-encoded, and finally "%3A" is turned back into ":" and every remaining
 * "%" into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
    $underscored = strtr( $id, ' ', '_' );
    $encoded = urlencode( decodeCharReferences( $underscored ) );

    // Order matters: restore colons before the generic "%" replacement.
    $encoded = str_replace( '%3A', ':', $encoded );
    return str_replace( '%', '.', $encoded );
}
|
602 |
||
603 |
/**
 * More or less "markup-safe" explode()
 * Ignores any instances of the separator inside <...>
 *
 * Separators inside tags are temporarily swapped for a NUL byte, the text is
 * split, and the NUL bytes are swapped back in every piece. NUL bytes already
 * present in the input are removed first.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text ) {
    $placeholder = "\x00";

    // Just in case...
    $text = str_replace( $placeholder, '', $text );

    // Hide separators that sit inside <...>
    $masker = new ReplacerCallback( $separator, $placeholder );
    $masked = preg_replace_callback( '/(<.*?>)/', array( $masker, 'go' ), $text );

    // Split, then restore the hidden separators in all pieces at once
    return str_replace( $placeholder, $separator, explode( $separator, $masked ) );
}
|
627 |
||
628 |
/**
 * Small callback object for preg_replace_callback(): replaces one fixed
 * substring with another inside the first capture group of each match.
 */
class ReplacerCallback {
    /** @var string substring to search for */
    var $from;

    /** @var string replacement for $from */
    var $to;

    /**
     * Uses __construct() because a method named after the class is no longer
     * treated as a constructor by PHP 8.
     *
     * @param string $from
     * @param string $to
     */
    function __construct( $from, $to ) {
        $this->from = $from;
        $this->to = $to;
    }

    /**
     * @param array $matches match array; index 1 is the text to rewrite
     * @return string
     */
    function go( $matches ) {
        return str_replace( $this->from, $this->to, $matches[1] );
    }
}
|
638 |
||
639 |
/**
 * Parse a partial tag string into an associative array of attributes.
 *
 * Names are lowercased. In each value, runs of whitespace collapse to one
 * space, the ends are trimmed, and character references are decoded. When a
 * name occurs more than once the last occurrence wins.
 *
 * @param string $text
 * @return array attribute name => value
 */
function decodeTagAttributes( $text ) {
    if ( trim( $text ) == '' ) {
        return array();
    }

    $found = array();
    $count = preg_match_all( MW_ATTRIBS_REGEX, $text, $found, PREG_SET_ORDER );
    if ( !$count ) {
        return array();
    }

    $result = array();
    foreach ( $found as $match ) {
        $name = strtolower( $match[1] );
        $raw = getTagAttributeCallback( $match );

        // Normalize whitespace
        $normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

        // Decode character references
        $result[$name] = decodeCharReferences( $normalized );
    }
    return $result;
}
|
676 |
||
677 |
/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * The value groups are probed from the highest index down: 6 is an unquoted
 * #colour, 5 a bare value, 4 a single-quoted and 3 a double-quoted value.
 * When there is no "=value" part at all (group 2 absent) the attribute name
 * itself is returned, since XHTML attributes must have a value.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
    for ( $group = 6; $group >= 3; $group-- ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( !isset( $set[2] ) ) {
        # 'Reduced' form: return explicitly the attribute name here.
        return $set[1];
    }

    die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
}
|
706 |
||
707 |
/**
 * Strip <nowiki> and <gallery> elements and HTML comments out of the text,
 * replacing each with a unique marker, and record what the marker stands for
 * in $state so that unstrip() / unstripNoWiki() can put it back.
 * If $state is already a valid strip state, entries are added to it.
 *
 * A fresh random marker prefix is generated on every call and stored in the
 * global $wgRandomKey.
 *
 * @param string $text
 * @param array $state strip state, filled under the 'nowiki' and 'general' keys
 * @param bool $stripcomments when set, HTML comments <!-- like this -->
 *  stay stripped (recorded in $state) instead of being put back into the
 *  returned text. This is important for section editing, where these
 *  comments cause confusion when counting the sections in the wikisource
 * @param array $dontstrip tags which should not be stripped;
 *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
 * @return string the text with markers in place of the stripped content
 *
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
    global $wgRandomKey;
    $render = true;

    $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
    $uniq_prefix =& $wgRandomKey;
    $commentState = array();

    $elements = array( 'nowiki', 'gallery' );

    # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
    foreach ( $elements as $k => $v ) {
        if ( !in_array( $v, $dontstrip ) ) continue;
        unset( $elements[$k] );
    }

    $matches = array();
    $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

    foreach ( $matches as $marker => $data ) {
        list( $element, $content, $params, $tag ) = $data;
        if ( $render ) {
            $tagName = strtolower( $element );
            switch ( $tagName ) {
                case '!--':
                    // Comment
                    if ( substr( $tag, -3 ) == '-->' ) {
                        $output = $tag;
                    } else {
                        // Unclosed comment in input.
                        // Close it so later stripping can remove it
                        $output = "$tag-->";
                    }
                    break;
                case 'html':
                    // Raw HTML is not supported here, so <html> content is
                    // escaped exactly like <nowiki>. (This used to test an
                    // undefined local, which always took this path.)
                case 'nowiki':
                    // $content is null for an empty element such as <nowiki />
                    $output = wfEscapeHTMLTagsOnly( (string)$content );
                    break;
                default:
                    // No renderer for this element: keep its source. Without
                    // this, $output was undefined or left over from the
                    // previous marker.
                    $output = $tag;
            }
        } else {
            // Just stripping tags; keep the source
            $output = $tag;
        }

        // Unstrip the output, because unstrip() is no longer recursive so
        // it won't do it itself
        $output = unstrip( $output, $state );

        if ( !$stripcomments && $element == '!--' ) {
            $commentState[$marker] = $output;
        } elseif ( $element == 'html' || $element == 'nowiki' ) {
            $state['nowiki'][$marker] = $output;
        } else {
            $state['general'][$marker] = $output;
        }
    }

    # Unstrip comments unless explicitly told otherwise.
    # (The comments are always stripped prior to this point, so as to
    # not invoke any extension tags / parser hooks contained within
    # a comment.)
    if ( !$stripcomments ) {
        // Put them all back and forget them
        $text = strtr( $text, $commentState );
    }

    return $text;
}
|
797 |
||
798 |
/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a random marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * An element without a closing tag runs to the end of the text. For an empty
 * element (<tag />) the content entry is null.
 *
 * @param array $elements list of element names. Comments are always extracted.
 * @param string $text Source text string.
 * @param array $matches output, see above; reset on every call
 * @param string $uniq_prefix prefix for the generated markers
 * @return string the text with markers substituted
 *
 * @access private
 * @static
 */
function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
    static $n = 1; // running marker number, shared by all calls
    $stripped = '';
    $matches = array();

    // Element names are literal text, not regex fragments.
    $quoted = array();
    foreach ( $elements as $name ) {
        $quoted[] = preg_quote( $name, '/' );
    }
    // With no element names, "(?!)" (which never matches) disables the tag
    // branch so that only comments are extracted. An empty alternation would
    // instead match "<>". The capture group layout stays the same.
    $taglist = count( $quoted ) ? implode( '|', $quoted ) : '(?!)';
    $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

    while ( '' != $text ) {
        $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
        $stripped .= $p[0];
        if ( count( $p ) < 5 ) {
            // Nothing more to extract
            break;
        }
        if ( count( $p ) > 5 ) {
            // comment
            $element    = $p[4];
            $attributes = '';
            $close      = '';
            $inside     = $p[5];
        } else {
            // tag
            $element    = $p[1];
            $attributes = $p[2];
            $close      = $p[3];
            $inside     = $p[4];
        }

        $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
        $stripped .= $marker;

        if ( $close === '/>' ) {
            // Empty element tag, <tag />
            $content = null;
            $text = $inside;
            $tail = null;
        } else {
            if ( $element == '!--' ) {
                $end = '/(-->)/';
            } else {
                // $element is text matched by the start pattern; quote it so
                // it is taken literally in the end pattern as well.
                $end = '/(<\\/' . preg_quote( $element, '/' ) . '\\s*>)/i';
            }
            $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
            $content = $q[0];
            if ( count( $q ) < 3 ) {
                # No end tag -- let it run out to the end of the text.
                $tail = '';
                $text = '';
            } else {
                $tail = $q[1];
                $text = $q[2];
            }
        }

        $matches[$marker] = array( $element,
            $content,
            decodeTagAttributes( $attributes ),
            "<$element$attributes$close$content$tail" );
    }
    return $stripped;
}
|
877 |
||
878 |
/**
 * Escape html tags
 * Basically replacing " > and < with HTML entities ( &quot;, &gt;, &lt;)
 *
 * Ampersands are deliberately left alone, so existing entities in the text
 * are not double-encoded.
 *
 * @param string $in text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in ) {
    return str_replace(
        array( '"', '>', '<' ),
        array( '&quot;', '&gt;', '&lt;' ),
        $in );
}
|
891 |
||
892 |
/**
 * Put back the content recorded under the 'general' key of the strip state.
 * Text is returned untouched when the state has no such key.
 *
 * always call unstripNoWiki() after this one
 *
 * @param string $text
 * @param array $state strip state (marker => replacement maps)
 * @return string
 * @private
 */
function unstrip( $text, &$state ) {
    # TODO: good candidate for FSS
    return isset( $state['general'] )
        ? strtr( $text, $state['general'] )
        : $text;
}
|
908 |
||
909 |
/**
 * Return the UTF-8 string for a codepoint when validateCodepoint() accepts
 * it, otherwise the UTF8_REPLACEMENT constant.
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
    return validateCodepoint( $codepoint )
        ? codepointToUtf8( $codepoint )
        : UTF8_REPLACEMENT;
}
|
923 |
||
924 |
/**
 * Decode a named entity using the global $wgHtmlEntities table (entity name
 * => codepoint). Names missing from the table are returned as pseudo-entity
 * source (eg &foo;).
 *
 * @param string $name entity name without "&" and ";"
 * @return string
 */
function decodeEntity( $name ) {
    global $wgHtmlEntities;

    if ( !isset( $wgHtmlEntities[$name] ) ) {
        return "&$name;";
    }
    return codepointToUtf8( $wgHtmlEntities[$name] );
}
|
940 |
||
941 |
/**
 * Returns true if a given Unicode codepoint is a valid character in XML:
 * tab, line feed, carriage return, or anything in U+0020-U+D7FF,
 * U+E000-U+FFFD or U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
    // The only control characters allowed
    foreach ( array( 0x09, 0x0a, 0x0d ) as $single ) {
        if ( $codepoint == $single ) {
            return true;
        }
    }

    // Everything else has to fall inside one of these inclusive ranges
    $ranges = array(
        array( 0x20, 0xd7ff ),
        array( 0xe000, 0xfffd ),
        array( 0x10000, 0x10ffff ),
    );
    foreach ( $ranges as $range ) {
        if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
            return true;
        }
    }
    return false;
}
|
954 |
||
955 |
/**
 * Return UTF-8 sequence for a given Unicode code point.
 * Prints a message and terminates the script when the code point is
 * 0x110000 or above.
 *
 * @param int $codepoint
 * @return string
 * @public
 */
function codepointToUtf8( $codepoint ) {
    if ( $codepoint < 0x80 ) {
        // One byte
        return chr( $codepoint );
    }

    if ( $codepoint < 0x800 ) {
        // Two bytes
        $lead = $codepoint >> 6 & 0x3f | 0xc0;
        $last = $codepoint & 0x3f | 0x80;
        return chr( $lead ) . chr( $last );
    }

    if ( $codepoint < 0x10000 ) {
        // Three bytes
        $lead = $codepoint >> 12 & 0x0f | 0xe0;
        $mid  = $codepoint >> 6 & 0x3f | 0x80;
        $last = $codepoint & 0x3f | 0x80;
        return chr( $lead ) . chr( $mid ) . chr( $last );
    }

    if ( $codepoint < 0x110000 ) {
        // Four bytes
        $lead = $codepoint >> 18 & 0x07 | 0xf0;
        $hi   = $codepoint >> 12 & 0x3f | 0x80;
        $lo   = $codepoint >> 6 & 0x3f | 0x80;
        $last = $codepoint & 0x3f | 0x80;
        return chr( $lead ) . chr( $hi ) . chr( $lo ) . chr( $last );
    }

    echo "Asked for code outside of range ($codepoint)\n";
    die( -1 );
}
|
978 |
||
979 |
/**
 * Callback for decodeCharReferences(): decode one MW_CHAR_REFS_REGEX match.
 *
 * Group 1 is a named entity, group 2 a decimal reference, groups 3 and 4 are
 * hexadecimal references. Anything else is returned as matched.
 *
 * @param array $matches
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        return decodeChar( intval( $matches[2] ) );
    }
    // "&#x..;" and "&#X..;" are captured separately but decoded alike
    foreach ( array( 3, 4 ) as $group ) {
        if ( $matches[$group] != '' ) {
            return decodeChar( hexdec( $matches[$group] ) );
        }
    }
    # Last case should be an ampersand by itself
    return $matches[0];
}
|
996 |
||
997 |
?> |