00001 <?php
/**
 * Regular expression to match the various forms of character reference.
 * Used by Sanitizer::normalizeCharReferences() and
 * Sanitizer::decodeCharReferences().
 *
 * Capture groups:
 *   1. named entity (&name;)
 *   2. decimal reference (&#123;)
 *   3. hexadecimal reference with lowercase marker (&#xAB;)
 *   4. hexadecimal reference with uppercase marker (&#XAB;)
 *   5. a stray ampersand that starts none of the above
 *
 * The hexadecimal groups only accept real hex digits. hexdec() silently
 * skips characters that are not hex digits, so a looser class would let
 * something like "&#x4G1;" be treated as U+0041.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
00037
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some rather sloppy HTML, such as unquoted values.
 * Used by Sanitizer::decodeTagAttributes().
 *
 * Capture groups:
 *   1. attribute name
 *   2. the whole "= value" part (absent for valueless attributes)
 *   3. double-quoted value
 *   4. single-quoted value
 *   5. unquoted value
 *   6. unquoted #RRGGBB-style colour
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|{$space})({$attrib}+)
	  ({$space}*={$space}*
		(?:
		 # The attribute value: quoted or alone
		  \"([^<\"]*)\"
		 | '([^<']*)'
		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
							 # colors are specified like this.
							 # We'll be normalizing it.
		)
	   )?(?={$space}|\$)/sx" );
00058
/**
 * Map of named HTML character entities to their Unicode code points.
 * Used by Sanitizer::normalizeEntity(), Sanitizer::decodeEntity() and
 * Sanitizer::hackDocType(). Declared global so the table is reachable
 * even when this file is included from inside a function.
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute'   => 193,
	'aacute'   => 225,
	'Acirc'    => 194,
	'acirc'    => 226,
	'acute'    => 180,
	'AElig'    => 198,
	'aelig'    => 230,
	'Agrave'   => 192,
	'agrave'   => 224,
	'alefsym'  => 8501,
	'Alpha'    => 913,
	'alpha'    => 945,
	'amp'      => 38,
	'and'      => 8743,
	'ang'      => 8736,
	'Aring'    => 197,
	'aring'    => 229,
	'asymp'    => 8776,
	'Atilde'   => 195,
	'atilde'   => 227,
	'Auml'     => 196,
	'auml'     => 228,
	'bdquo'    => 8222,
	'Beta'     => 914,
	'beta'     => 946,
	'brvbar'   => 166,
	'bull'     => 8226,
	'cap'      => 8745,
	'Ccedil'   => 199,
	'ccedil'   => 231,
	'cedil'    => 184,
	'cent'     => 162,
	'Chi'      => 935,
	'chi'      => 967,
	'circ'     => 710,
	'clubs'    => 9827,
	'cong'     => 8773,
	'copy'     => 169,
	'crarr'    => 8629,
	'cup'      => 8746,
	'curren'   => 164,
	'dagger'   => 8224,
	'Dagger'   => 8225,
	'darr'     => 8595,
	'dArr'     => 8659,
	'deg'      => 176,
	'Delta'    => 916,
	'delta'    => 948,
	'diams'    => 9830,
	'divide'   => 247,
	'Eacute'   => 201,
	'eacute'   => 233,
	'Ecirc'    => 202,
	'ecirc'    => 234,
	'Egrave'   => 200,
	'egrave'   => 232,
	'empty'    => 8709,
	'emsp'     => 8195,
	'ensp'     => 8194,
	'Epsilon'  => 917,
	'epsilon'  => 949,
	'equiv'    => 8801,
	'Eta'      => 919,
	'eta'      => 951,
	'ETH'      => 208,
	'eth'      => 240,
	'Euml'     => 203,
	'euml'     => 235,
	'euro'     => 8364,
	'exist'    => 8707,
	'fnof'     => 402,
	'forall'   => 8704,
	'frac12'   => 189,
	'frac14'   => 188,
	'frac34'   => 190,
	'frasl'    => 8260,
	'Gamma'    => 915,
	'gamma'    => 947,
	'ge'       => 8805,
	'gt'       => 62,
	'harr'     => 8596,
	'hArr'     => 8660,
	'hearts'   => 9829,
	'hellip'   => 8230,
	'Iacute'   => 205,
	'iacute'   => 237,
	'Icirc'    => 206,
	'icirc'    => 238,
	'iexcl'    => 161,
	'Igrave'   => 204,
	'igrave'   => 236,
	'image'    => 8465,
	'infin'    => 8734,
	'int'      => 8747,
	'Iota'     => 921,
	'iota'     => 953,
	'iquest'   => 191,
	'isin'     => 8712,
	'Iuml'     => 207,
	'iuml'     => 239,
	'Kappa'    => 922,
	'kappa'    => 954,
	'Lambda'   => 923,
	'lambda'   => 955,
	'lang'     => 9001,
	'laquo'    => 171,
	'larr'     => 8592,
	'lArr'     => 8656,
	'lceil'    => 8968,
	'ldquo'    => 8220,
	'le'       => 8804,
	'lfloor'   => 8970,
	'lowast'   => 8727,
	'loz'      => 9674,
	'lrm'      => 8206,
	'lsaquo'   => 8249,
	'lsquo'    => 8216,
	'lt'       => 60,
	'macr'     => 175,
	'mdash'    => 8212,
	'micro'    => 181,
	'middot'   => 183,
	'minus'    => 8722,
	'Mu'       => 924,
	'mu'       => 956,
	'nabla'    => 8711,
	'nbsp'     => 160,
	'ndash'    => 8211,
	'ne'       => 8800,
	'ni'       => 8715,
	'not'      => 172,
	'notin'    => 8713,
	'nsub'     => 8836,
	'Ntilde'   => 209,
	'ntilde'   => 241,
	'Nu'       => 925,
	'nu'       => 957,
	'Oacute'   => 211,
	'oacute'   => 243,
	'Ocirc'    => 212,
	'ocirc'    => 244,
	'OElig'    => 338,
	'oelig'    => 339,
	'Ograve'   => 210,
	'ograve'   => 242,
	'oline'    => 8254,
	'Omega'    => 937,
	'omega'    => 969,
	'Omicron'  => 927,
	'omicron'  => 959,
	'oplus'    => 8853,
	'or'       => 8744,
	'ordf'     => 170,
	'ordm'     => 186,
	'Oslash'   => 216,
	'oslash'   => 248,
	'Otilde'   => 213,
	'otilde'   => 245,
	'otimes'   => 8855,
	'Ouml'     => 214,
	'ouml'     => 246,
	'para'     => 182,
	'part'     => 8706,
	'permil'   => 8240,
	'perp'     => 8869,
	'Phi'      => 934,
	'phi'      => 966,
	'Pi'       => 928,
	'pi'       => 960,
	'piv'      => 982,
	'plusmn'   => 177,
	'pound'    => 163,
	'prime'    => 8242,
	'Prime'    => 8243,
	'prod'     => 8719,
	'prop'     => 8733,
	'Psi'      => 936,
	'psi'      => 968,
	'quot'     => 34,
	'radic'    => 8730,
	'rang'     => 9002,
	'raquo'    => 187,
	'rarr'     => 8594,
	'rArr'     => 8658,
	'rceil'    => 8969,
	'rdquo'    => 8221,
	'real'     => 8476,
	'reg'      => 174,
	'rfloor'   => 8971,
	'Rho'      => 929,
	'rho'      => 961,
	'rlm'      => 8207,
	'rsaquo'   => 8250,
	'rsquo'    => 8217,
	'sbquo'    => 8218,
	'Scaron'   => 352,
	'scaron'   => 353,
	'sdot'     => 8901,
	'sect'     => 167,
	'shy'      => 173,
	'Sigma'    => 931,
	'sigma'    => 963,
	'sigmaf'   => 962,
	'sim'      => 8764,
	'spades'   => 9824,
	'sub'      => 8834,
	'sube'     => 8838,
	'sum'      => 8721,
	'sup'      => 8835,
	'sup1'     => 185,
	'sup2'     => 178,
	'sup3'     => 179,
	'supe'     => 8839,
	'szlig'    => 223,
	'Tau'      => 932,
	'tau'      => 964,
	'there4'   => 8756,
	'Theta'    => 920,
	'theta'    => 952,
	'thetasym' => 977,
	'thinsp'   => 8201,
	'THORN'    => 222,
	'thorn'    => 254,
	'tilde'    => 732,
	'times'    => 215,
	'trade'    => 8482,
	'Uacute'   => 218,
	'uacute'   => 250,
	'uarr'     => 8593,
	'uArr'     => 8657,
	'Ucirc'    => 219,
	'ucirc'    => 251,
	'Ugrave'   => 217,
	'ugrave'   => 249,
	'uml'      => 168,
	'upsih'    => 978,
	'Upsilon'  => 933,
	'upsilon'  => 965,
	'Uuml'     => 220,
	'uuml'     => 252,
	'weierp'   => 8472,
	'Xi'       => 926,
	'xi'       => 958,
	'Yacute'   => 221,
	'yacute'   => 253,
	'yen'      => 165,
	'Yuml'     => 376,
	'yuml'     => 255,
	'Zeta'     => 918,
	'zeta'     => 950,
	'zwj'      => 8205,
	'zwnj'     => 8204,
);
00318
/**
 * Localised spellings of entity names, mapped to the canonical name
 * they stand for. Consulted by Sanitizer::normalizeEntity() and
 * Sanitizer::decodeEntity() before the main entity table.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
	'רלמ' => 'rlm',
	'رلم' => 'rlm',
);
00327
00328
00333 class Sanitizer {
/**
 * Cleans up HTML: keeps only whitelisted tags, strips non-approved
 * attributes from them, escapes everything else, and removes HTML
 * comments.
 *
 * When $wgUseTidy is off the method also balances the markup itself:
 * it tracks open tags on a stack, rejects bad nesting, and closes
 * whatever is still open at the end. When $wgUseTidy is on, only the
 * tag/attribute filtering is done here.
 *
 * @param $text String: markup to clean
 * @param $processCallback Callback: optional; called as
 *   callback( &$params, $args ) so the raw attribute text of each
 *   accepted tag can be rewritten before it is validated
 * @param $args Array: second argument passed to $processCallback
 * @param $extratags Array: additional tag names to accept for this call
 *   (treated as tags that must be closed)
 * @return String
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
	global $wgUseTidy;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	wfProfileIn( __METHOD__ );

	if ( !$staticInitialised ) {

		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
		$htmlelementsStatic = array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = true;
	}

	# The caller-supplied tags must be merged on every call. Folding them
	# into the static cache would freeze whatever the first caller passed
	# and ignore $extratags on all later calls.
	$extratags = array_flip( $extratags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_merge( $extratags, $htmlelementsStatic );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	# Text before the first '<' cannot contain a tag; just escape '>'
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !$wgUseTidy ) {
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
				list( /* whole match */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = 0;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if ( isset( $htmlsingleonly[$t] ) ) {
						$badtag = 1;
					} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push( $optstack, $ot );
							while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
								isset( $htmlsingleallowed[$ot] ) )
							{
								array_push( $optstack, $ot );
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = 1;
								while ( $ot = @array_pop( $optstack ) ) {
									array_push( $tagstack, $ot );
								}
							}
						} else {
							@array_push( $tagstack, $ot );
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !( isset( $htmllist[$ot] ) && isset( $listtags[$t] ) ) ) {
								$badtag = 1;
							}
						}
					} else {
						if ( $t == 'table' ) {
							# Leaving a table: restore the stack saved when it was opened
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} elseif ( in_array( $t, $tagstack ) &&
						!isset( $htmlnest[$t] ) ) {
						$badtag = 1;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' &&
						isset( $htmlpairs[$t] ) ) {
						$badtag = 1;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
					} elseif ( isset( $tabletags[$t] )
						&& in_array( $t, $tagstack ) ) {
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							# Entering a table: start a fresh stack for its contents
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not an accepted tag: emit it as escaped text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
				$x, $regs );
			@list( /* whole match */, $slash, $t, $params, $brace, $rest ) = $regs;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00518
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 * An unterminated comment is left in place.
 *
 * @param $text String
 * @return String
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Point just past the closing '-->'
		$close += 3;

		# Widen the span over the spaces on either side of the comment
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $length, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00563
/**
 * Filter a set of attributes against the whitelist registered for
 * the given element.
 *
 * @param $attribs Array: attribute name => value
 * @param $element String: element name used to look up the whitelist
 * @return Array: the attributes that survived validateAttributes()
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = Sanitizer::attributeWhitelist( $element );
	return Sanitizer::validateAttributes( $attribs, $allowed );
}
00583
/**
 * Filter a set of attributes against an explicit whitelist.
 *
 * - Attributes whose name is not in $whitelist are discarded.
 * - 'style' values are passed through checkCss().
 * - 'id' values are passed through escapeId(); the mode depends on
 *   $wgEnforceHtmlIds.
 *
 * @param $attribs Array: attribute name => value
 * @param $whitelist Array: list of allowed attribute names
 * @return Array: filtered attribute name => value
 */
static function validateAttributes( $attribs, $whitelist ) {
	$allowed = array_flip( $whitelist );
	$clean = array();
	foreach ( $attribs as $name => $val ) {
		if ( isset( $allowed[$name] ) ) {
			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if ( $name == 'style' ) {
				$val = Sanitizer::checkCss( $val );
			}

			if ( $name === 'id' ) {
				global $wgEnforceHtmlIds;
				$mode = $wgEnforceHtmlIds ? 'noninitial' : 'xml';
				$val = Sanitizer::escapeId( $val, $mode );
			}

			$clean[$name] = $val;
		}
	}
	return $clean;
}
00624
/**
 * Merge two sets of HTML attributes. Values in $b override values in
 * $a for the same key, except 'class': when both sides carry a
 * different string class, the two lists are combined with duplicates
 * removed.
 *
 * @param $a Array
 * @param $b Array
 * @return Array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );
	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}

	$first = $a['class'];
	$second = $b['class'];
	if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$first} {$second}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
00646
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or a short CSS comment explaining why
 * the input was rejected.
 *
 * Steps: decode character references, replace CSS comments with a
 * space, decode CSS backslash escapes, then reject the value if it
 * contains control characters or one of the blacklisted constructs
 * (expression, filter:, accelerator:, url().
 *
 * @param $value String
 * @return String
 */
static function checkCss( $value ) {
	$value = Sanitizer::decodeCharReferences( $value );

	# Remove any comments; IE gets token splitting wrong
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	# Decode escape sequences and line continuation. The pattern is
	# built once and reused on later calls.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$decoder = array( __CLASS__, 'cssDecodeCallback' );
	$value = preg_replace_callback( $decodeRegex, $decoder, $value );

	# Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
		return '/* invalid control char */';
	}
	if ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
00688
/**
 * preg_replace_callback handler for checkCss(): turns one CSS
 * backslash escape into the character it denotes.
 *
 * @param $matches Array: groups are 1 = line continuation,
 *   2 = hexadecimal character number, 3 = escaped literal character
 * @return String
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		# Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		# Backslash at end of string
		$char = '\\';
	}

	switch ( $char ) {
		case "\n":
		case '"':
		case "'":
		case '\\':
			# These characters need to be escaped in strings
			# Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';
		default:
			# Decode unnecessary escape
			return $char;
	}
}
00709
/**
 * Take a tag soup fragment listing an HTML element's attributes and
 * normalize it to well-formed XML, discarding attributes that are not
 * whitelisted for the given element.
 *
 * Every surviving attribute is emitted as name="value" with the name
 * HTML-escaped and the value run through safeEncodeAttribute().
 *
 * @param $text String: raw attribute text
 * @param $element String: element name
 * @return String: '' or the attributes with a leading space
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$decoded = Sanitizer::decodeTagAttributes( $text );
	$allowed = Sanitizer::validateTagAttributes( $decoded, $element );

	$html = '';
	foreach ( $allowed as $name => $val ) {
		$encName = htmlspecialchars( $name );
		$encVal = Sanitizer::safeEncodeAttribute( $val );
		$html .= ' ' . $encName . '="' . $encVal . '"';
	}
	return $html;
}
00746
/**
 * Encode an attribute value for HTML output.
 *
 * @param $text String
 * @return String: HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	$encValue = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding, but a numeric
	// reference survives it. Emit references rather than plain spaces so
	// that newlines, carriage returns and tabs are not lost.
	$encValue = strtr( $encValue, array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	) );

	return $encValue;
}
00766
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * @param $text String
 * @return String: HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	$encValue = Sanitizer::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	# Each token is replaced by an equivalent spelling that uses a
	# numeric character reference, so it renders the same but is no
	# longer recognised as markup.
	$encValue = strtr( $encValue, array(
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	) );

	# Stupid hack: break up URL protocols so they are not auto-linked
	$encValue = preg_replace_callback(
		'/(' . wfUrlProtocols() . ')/',
		array( 'Sanitizer', 'armorLinksCallback' ),
		$encValue );
	return $encValue;
}
00799
/**
 * Given a value, escape it so that it can be used in an id attribute.
 *
 * Two schemes are implemented:
 * - default (HTML4-style): spaces become underscores, character
 *   references are decoded, the result is URL-encoded with '%'
 *   rewritten to '.' and '%3A' to ':'; an 'x' is prepended when the
 *   result does not start with an ASCII letter.
 * - 'xml': every run of characters that is not an XML NameChar (or is
 *   an underscore) becomes a single '_', surrounding underscores are
 *   trimmed, and a '_' is prepended when the result does not start
 *   with a NameStartChar.
 *
 * @param $id String: id to escape
 * @param $options Mixed: string or array of strings. 'xml' selects the
 *   XML scheme; 'noninitial' suppresses the prefix that guarantees a
 *   valid first character.
 * @return String
 */
static function escapeId( $id, $options = array() ) {
	$options = (array)$options;
	$mayStartAnyhow = in_array( 'noninitial', $options );

	if ( in_array( 'xml', $options ) ) {
		# XML-style escaping. For the patterns used, see the XML 1.0 standard,
		# 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
		$nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
			. '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
			. '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
		$nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
			. '\x{203F}-\x{2040}';
		# Replace _ as well so we don't get multiple consecutive underscores
		$id = trim( preg_replace( "/([^$nameChar]|_)+/u", '_', $id ), '_' );

		if ( !$mayStartAnyhow && !preg_match( "/^[$nameStartChar]/u", $id ) ) {
			$id = "_$id";
		}
		return $id;
	}

	# HTML4-style escaping
	$decoded = Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) );
	$id = strtr( urlencode( $decoded ), array(
		'%3A' => ':',
		'%' => '.'
	) );

	if ( !$mayStartAnyhow && !preg_match( '/^[a-zA-Z]/', $id ) ) {
		// Initial character must be a letter!
		$id = "x$id";
	}
	return $id;
}
00860
/**
 * Given a value, escape it so that it can be used as a CSS class.
 *
 * A leading digit or hyphen, ASCII control characters and spaces, most
 * ASCII punctuation, and the non-breaking space are each replaced by an
 * underscore. Runs of underscores are then collapsed and trailing
 * underscores removed.
 *
 * @param $class String
 * @return String
 */
static function escapeClass( $class ) {
	$forbidden = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
	$escaped = preg_replace( $forbidden, '_', $class );
	$collapsed = preg_replace( '/_+/', '_', $escaped );
	return rtrim( $collapsed, '_' );
}
00879
/**
 * Given HTML input, escape with htmlspecialchars but un-escape
 * entities. This allows (generally harmless) entities like &nbsp; to
 * survive.
 *
 * @param $html String: string to escape
 * @return String: escaped input
 */
static function escapeHtmlAllowEntities( $html ) {
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	# Undo the escaping of ampersands, then let normalizeCharReferences()
	# re-escape the ones that do not start a valid character reference.
	$html = str_replace( '&amp;', '&', $html );
	$html = Sanitizer::normalizeCharReferences( $html );
	return $html;
}
00895
/**
 * Regex replace callback for armoring links against further
 * processing: the colon of the matched URL protocol is rewritten as a
 * numeric character reference.
 *
 * @param $matches Array: $matches[1] is the matched protocol
 * @return String
 */
private static function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}
00905
/**
 * Return an associative array of attribute names and values from a
 * partial tag string. Attribute names are forced to lowercase,
 * whitespace inside values is collapsed and trimmed, and character
 * references in values are decoded to UTF-8 text.
 * When the same attribute appears more than once, the last one wins.
 *
 * @param $text String
 * @return Array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$sets = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $sets, PREG_SET_ORDER );
	if ( !$found ) {
		return array();
	}

	$decoded = array();
	foreach ( $sets as $set ) {
		$name = strtolower( $set[1] );
		$raw = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$raw = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$decoded[$name] = Sanitizer::decodeCharReferences( $raw );
	}
	return $decoded;
}
00943
/**
 * Pick the appropriate attribute value from a match set produced by
 * MW_ATTRIBS_REGEX.
 *
 * The highest-numbered group present in the set is the alternative
 * that matched: 6 = unquoted '#colour', 5 = unquoted, 4 = single-quoted,
 * 3 = double-quoted. A set with no value part at all yields the
 * attribute name itself.
 *
 * @param $set Array: one PREG_SET_ORDER match
 * @return String
 * @throws MWException when a value part is present but none of the
 *   value groups is
 */
private static function getTagAttributeCallback( $set ) {
	for ( $group = 6; $group >= 3; $group-- ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
00973
/**
 * Normalize whitespace and character references in an XML source-
 * encoded text for an attribute value, and escape double quotes so
 * the result can be placed between double quotes.
 *
 * @param $text String
 * @return String
 */
private static function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		self::normalizeWhitespace(
			Sanitizer::normalizeCharReferences( $text ) ) );
}
00991
/**
 * Replace every whitespace character (space, tab, CR, LF) with a
 * single space; a CR LF pair counts as one character.
 *
 * @param $text String
 * @return String
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $text );
}
00998
/**
 * Ensure that any entities and character references are legal for
 * XML and XHTML specifically. Every match of MW_CHAR_REFS_REGEX is
 * handed to normalizeCharReferencesCallback().
 *
 * @param $text String
 * @return String
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
/**
 * preg_replace_callback handler for normalizeCharReferences().
 * Dispatches on whichever capture group of MW_CHAR_REFS_REGEX matched;
 * if no group yields a normalized reference, the matched text is
 * HTML-escaped instead.
 *
 * @param $matches Array
 * @return String
 */
static function normalizeCharReferencesCallback( $matches ) {
	# Capture group => method that normalizes its content
	$handlers = array(
		1 => 'normalizeEntity',
		2 => 'decCharReference',
		3 => 'hexCharReference',
		4 => 'hexCharReference',
	);

	$ret = null;
	foreach ( $handlers as $group => $method ) {
		if ( $matches[$group] != '' ) {
			$ret = call_user_func( array( 'Sanitizer', $method ), $matches[$group] );
			break;
		}
	}

	return is_null( $ret ) ? htmlspecialchars( $matches[0] ) : $ret;
}
01040
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD (or is
 * a known alias of one), return the entity reference. Otherwise return
 * the text with its ampersand escaped, so that pseudo-entity source is
 * displayed literally (e.g. "&foo;" becomes "&amp;foo;").
 *
 * @param $name String: entity name without '&' and ';'
 * @return String
 */
static function normalizeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;
	if ( isset( $wgHtmlEntityAliases[$name] ) ) {
		return "&{$wgHtmlEntityAliases[$name]};";
	} elseif ( isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	} else {
		return "&amp;$name;";
	}
}
01061
/**
 * Normalize a decimal character reference.
 *
 * @param $codepoint String: the digits of the reference
 * @return String: '&#N;' in canonical decimal form, or null when the
 *   code point fails validateCodepoint()
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
01070
/**
 * Normalize a hexadecimal character reference.
 *
 * @param $codepoint String: the hex digits of the reference
 * @return String: '&#xN;' with lowercase hex digits, or null when the
 *   code point fails validateCodepoint()
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
01079
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * XML: tab, line feed, carriage return, or anything in the ranges
 * U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
 *
 * @param $codepoint Integer
 * @return Boolean
 */
private static function validateCodepoint( $codepoint ) {
	# The three control characters that are allowed
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	$ranges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach ( $ranges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}
	return false;
}
01093
/**
 * Decode any character references, numeric or named entities, in the
 * text and return a UTF-8 string. Every match of MW_CHAR_REFS_REGEX is
 * handed to decodeCharReferencesCallback().
 *
 * @param $text String
 * @return String
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
01109
/**
 * preg_replace_callback handler for decodeCharReferences().
 * Dispatches on whichever capture group of MW_CHAR_REFS_REGEX matched.
 *
 * @param $matches Array
 * @return String
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		# Named entity
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		# Decimal reference
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	# Hexadecimal reference, with either 'x' or 'X' as marker
	foreach ( array( 3, 4 ) as $group ) {
		if ( $matches[$group] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[$group] ) );
		}
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
01127
/**
 * Return the UTF-8 encoding of a code point. Code points that fail
 * validateCodepoint() yield UTF8_REPLACEMENT instead.
 *
 * @param $codepoint Integer
 * @return String
 */
static function decodeChar( $codepoint ) {
	if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
		return UTF8_REPLACEMENT;
	}
	return codepointToUtf8( $codepoint );
}
01142
/**
 * If the named entity is defined in $wgHtmlEntities (directly or via
 * an alias in $wgHtmlEntityAliases), return the UTF-8 encoding of that
 * character. Otherwise return the pseudo-entity source (e.g. "&foo;"),
 * spelled with the alias-resolved name.
 *
 * @param $name String: entity name without '&' and ';'
 * @return String
 */
static function decodeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;

	$canonical = isset( $wgHtmlEntityAliases[$name] )
		? $wgHtmlEntityAliases[$name]
		: $name;

	if ( !isset( $wgHtmlEntities[$canonical] ) ) {
		return "&$canonical;";
	}
	return codepointToUtf8( $wgHtmlEntities[$canonical] );
}
01162
/**
 * Fetch the whitelist of acceptable attributes for a given element
 * name. The full table is built once by setupAttributeWhitelist() and
 * kept for later calls.
 *
 * @param $element String
 * @return Array: allowed attribute names; empty for unknown elements
 */
static function attributeWhitelist( $element ) {
	static $list;
	if ( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	if ( !isset( $list[$element] ) ) {
		return array();
	}
	return $list[$element];
}
01179
/**
 * Build the table of allowed attributes per element.
 *
 * @return Array: element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	# Building blocks shared by many elements
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap',  # deprecated
		'width',   # deprecated
		'height',  # deprecated
		'bgcolor', # deprecated
	);

	# Combinations used by more than one element
	$edit = array_merge( $common, array( 'cite', 'datetime' ) );
	$rowgroup = array_merge( $common, $tablealign );
	$column = array_merge( $common, array( 'span', 'width' ), $tablealign );
	$cell = array_merge( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	return array(
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6 (address) and 8.2.4 (bdo) are not whitelisted

		# 9.2.1 (dfn, samp, kbd, abbr, acronym are not whitelisted)
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		'code'       => $common,
		'var'        => $common,

		# 9.2.2 (q is not whitelisted)
		'blockquote' => array_merge( $common, array( 'cite' ) ),

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => $edit,
		'del'        => $edit,

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									'rules', 'cellspacing', 'cellpadding',
									'align', 'bgcolor',
							) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => $rowgroup,
		'tfoot'      => $rowgroup,
		'tbody'      => $rowgroup,

		# 11.2.4
		'colgroup'   => $column,
		'col'        => $column,

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => $cell,
		'th'         => $cell,

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized
		'img'        => array_merge( $common, array( 'alt' ) ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2 (basefont is not whitelisted)
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only
		# (rbc and rtc are not whitelisted; 'rbspan' is not allowed on rt).
		'ruby'       => $common,
		'rb'         => $common,
		'rt'         => $common,
		'rp'         => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math'       => array( 'class', 'style', 'id', 'title' ),
	);
}
01330
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed, character references decoded, and whitespace
 * normalized.
 *
 * Warning: the result is plain text, not HTML; it must be escaped
 * again before being output in an HTML context.
 *
 * @param $text String: HTML fragment
 * @return String
 */
static function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities and whitespace
	return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
01351
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations:
 * one <!ENTITY> declaration per entry of $wgHtmlEntities, each mapped
 * to a decimal character reference.
 *
 * @return String
 */
static function hackDocType() {
	global $wgHtmlEntities;

	$declarations = '';
	foreach ( $wgHtmlEntities as $entity => $codepoint ) {
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}
	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
01371
/**
 * Clean up a URL: decode character references, percent-encode
 * brackets, angle brackets, double quotes, control characters and
 * spaces, and strip characters from the host part that are ignored
 * in internationalised domain names.
 *
 * @param $url String
 * @return String
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step.
	# A callback is used rather than the /e modifier: /e was removed in
	# PHP 7, and it backslash-escaped quotes before evaluation, which
	# turned '"' into '%5C%22' instead of '%22'.
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
		array( 'Sanitizer', 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
			/xuD";

		$host = preg_replace( $strip, '', $host );

		// @fixme: validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}

/**
 * preg_replace_callback handler for cleanUrl(): percent-encode the
 * matched character.
 *
 * @param $matches Array
 * @return String
 */
private static function cleanUrlCallback( $matches ) {
	return urlencode( $matches[0] );
}
01413
01414 }