diff -r 902822492a68 -r fe660c52c48f includes/wikiengine/Tables.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/includes/wikiengine/Tables.php Wed Jun 13 16:07:17 2007 -0400 @@ -0,0 +1,997 @@ + + */ + + global $mStripState, $wgRandomKey; + $mStripState = Array(); + + $attrib = '[a-zA-Z0-9]'; + $space = '[\x09\x0a\x0d\x20]'; + + define( 'MW_CHAR_REFS_REGEX', + '/&([A-Za-z0-9]+); + |&\#([0-9]+); + |&\#x([0-9A-Za-z]+); + |&\#X([0-9A-Za-z]+); + |(&)/x' ); + + define( 'MW_ATTRIBS_REGEX', + "/(?:^|$space)($attrib+) + ($space*=$space* + (?: + # The attribute value: quoted or alone + ".'"'."([^<".'"'."]*)".'"'." + | '([^<']*)' + | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) + | (\#[0-9a-fA-F]+) # Technically wrong, but lots of + # colors are specified like this. + # We'll be normalizing it. + ) + )?(?=$space|\$)/sx" ); + + /** + * emulate mediawiki parser, including stripping, etc. + * + * @param string $text the text to parse + * @return string + * @access public + */ + + function process_tables( $text ) + { + // include some globals, do some parser stuff that would normally be done in the parent parser function + global $mStripState; + $x =& $mStripState; + //$text = mwStrip( $text, $x ); + + // parse the text + $text = doTableStuff($text); + + // Unstrip it + // $text = unstrip( $text, $mStripState ); + // $text = unstripNoWiki( $text, $mStripState ); + //die('

'.print_r($mStripState, true).'

'); + return $text; + } + + /** + * parse the wiki syntax used to render tables + * + * @param string $t the text to parse + * @return string + * @access private + */ + function doTableStuff( $t ) { + + $t = explode ( "\n" , $t ) ; + $td = array () ; # Is currently a td tag open? + $ltd = array () ; # Was it TD or TH? + $tr = array () ; # Is currently a tr tag open? + $ltr = array () ; # tr attributes + $has_opened_tr = array(); # Did this table open a element? + $indent_level = 0; # indent level of the table + foreach ( $t AS $k => $x ) + { + $x = trim ( $x ) ; + $fc = substr ( $x , 0 , 1 ) ; + if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) ) { + $indent_level = strlen( $matches[1] ); + + $attributes = unstripForHTML( $matches[2] ); + + $t[$k] = str_repeat( '

', $indent_level ) . + '' ; + array_push ( $td , false ) ; + array_push ( $ltd , '' ) ; + array_push ( $tr , false ) ; + array_push ( $ltr , '' ) ; + array_push ( $has_opened_tr, false ); + } + else if ( count ( $td ) == 0 ) { } # Don't do any of the following + else if ( '|}' == substr ( $x , 0 , 2 ) ) { + $z = "" . substr ( $x , 2); + $l = array_pop ( $ltd ) ; + if ( !array_pop ( $has_opened_tr ) ) $z = "" . $z ; + if ( array_pop ( $tr ) ) $z = '' . $z ; + if ( array_pop ( $td ) ) $z = '' . $z ; + array_pop ( $ltr ) ; + $t[$k] = $z . str_repeat( '

', $indent_level ); + } + else if ( '|-' == substr ( $x , 0 , 2 ) ) { # Allows for |--------------- + $x = substr ( $x , 1 ) ; + while ( $x != '' && substr ( $x , 0 , 1 ) == '-' ) $x = substr ( $x , 1 ) ; + $z = '' ; + $l = array_pop ( $ltd ) ; + array_pop ( $has_opened_tr ); + array_push ( $has_opened_tr , true ) ; + if ( array_pop ( $tr ) ) $z = '' . $z ; + if ( array_pop ( $td ) ) $z = '' . $z ; + array_pop ( $ltr ) ; + $t[$k] = $z ; + array_push ( $tr , false ) ; + array_push ( $td , false ) ; + array_push ( $ltd , '' ) ; + $attributes = unstripForHTML( $x ); + array_push ( $ltr , fixTagAttributes( $attributes, 'tr' ) ) ; + } + else if ( '|' == $fc || '!' == $fc || '|+' == substr ( $x , 0 , 2 ) ) { # Caption + # $x is a table row + if ( '|+' == substr ( $x , 0 , 2 ) ) { + $fc = '+' ; + $x = substr ( $x , 1 ) ; + } + $after = substr ( $x , 1 ) ; + if ( $fc == '!' ) $after = str_replace ( '!!' , '||' , $after ) ; + + // Split up multiple cells on the same line. + // FIXME: This can result in improper nesting of tags processed + // by earlier parser steps, but should avoid splitting up eg + // attribute values containing literal "||". + $after = wfExplodeMarkup( '||', $after ); + + $t[$k] = '' ; + + # Loop through each table cell + foreach ( $after AS $theline ) + { + $z = '' ; + if ( $fc != '+' ) + { + $tra = array_pop ( $ltr ) ; + if ( !array_pop ( $tr ) ) $z = '\n" ; + array_push ( $tr , true ) ; + array_push ( $ltr , '' ) ; + array_pop ( $has_opened_tr ); + array_push ( $has_opened_tr , true ) ; + } + + $l = array_pop ( $ltd ) ; + if ( array_pop ( $td ) ) $z = '' . $z ; + if ( $fc == '|' ) $l = 'td' ; + else if ( $fc == '!' ) $l = 'th' ; + else if ( $fc == '+' ) $l = 'caption' ; + else $l = '' ; + array_push ( $ltd , $l ) ; + + # Cell parameters + $y = explode ( '|' , $theline , 2 ) ; + # Note that a '|' inside an invalid link should not + # be mistaken as delimiting cell parameters + if ( strpos( $y[0], '[[' ) !== false ) { + $y = array ($theline); + } + if ( count ( $y ) == 1 ) + $y = "{$z}<{$l}>{$y[0]}" ; + else { + $attributes = unstripForHTML( $y[0] ); + $y = "{$z}<{$l}".fixTagAttributes($attributes, $l).">{$y[1]}" ; + } + $t[$k] .= $y ; + array_push ( $td , true ) ; + } + } + } + + # Closing open td, tr && table + while ( count ( $td ) > 0 ) + { + $l = array_pop ( $ltd ) ; + if ( array_pop ( $td ) ) $t[] = '' ; + if ( array_pop ( $tr ) ) $t[] = '' ; + if ( !array_pop ( $has_opened_tr ) ) $t[] = "" ; + $t[] = '' ; + } + + $t = implode ( "\n" , $t ) ; + + # special case: don't return empty table + if($t == "\n\n

") + $t = ''; + return $t ; + } + + /** + * Take a tag soup fragment listing an HTML element's attributes + * and normalize it to well-formed XML, discarding unwanted attributes. + * Output is safe for further wikitext processing, with escaping of + * values that could trigger problems. + * + * - Normalizes attribute names to lowercase + * - Discards attributes not on a whitelist for the given element + * - Turns broken or invalid entities into plaintext + * - Double-quotes all attribute values + * - Attributes without values are given the name as attribute + * - Double attributes are discarded + * - Unsafe style attributes are discarded + * - Prepends space if there are attributes. + * + * @param string $text + * @param string $element + * @return string + */ + function fixTagAttributes( $text, $element ) { + if( trim( $text ) == '' ) { + return ''; + } + + $stripped = validateTagAttributes( + decodeTagAttributes( $text ), $element ); + + $attribs = array(); + foreach( $stripped as $attribute => $value ) { + $encAttribute = htmlspecialchars( $attribute ); + $encValue = safeEncodeAttribute( $value ); + + $attribs[] = "$encAttribute=".'"'."$encValue".'"'.""; // " + } + return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; + } + + /** + * Encode an attribute value for HTML tags, with extra armoring + * against further wiki processing. + * @param $text + * @return HTML-encoded text fragment + */ + function safeEncodeAttribute( $text ) { + $encValue= encodeAttribute( $text ); + + # Templates and links may be expanded in later parsing, + # creating invalid or dangerous output. Suppress this. + $encValue = strtr( $encValue, array( + '<' => '<', // This should never happen, + '>' => '>', // we've received invalid input + '"' => '"', // which should have been escaped. + '{' => '{', + '[' => '[', + "''" => '''', + 'ISBN' => 'ISBN', + 'RFC' => 'RFC', + 'PMID' => 'PMID', + '|' => '|', + '__' => '__', + ) ); + + return $encValue; + } + + /** + * Encode an attribute value for HTML output. + * @param $text + * @return HTML-encoded text fragment + */ + function encodeAttribute( $text ) { + $encValue = htmlspecialchars( $text ); + + // Whitespace is normalized during attribute decoding, + // so if we've been passed non-spaces we must encode them + // ahead of time or they won't be preserved. + $encValue = strtr( $encValue, array( + "\n" => ' ', + "\r" => ' ', + "\t" => ' ', + ) ); + + return $encValue; + } + + function unstripForHTML( $text ) { + global $mStripState; + $text = unstrip( $text, $mStripState ); + $text = unstripNoWiki( $text, $mStripState ); + return $text; + } + + /** + * Always call this after unstrip() to preserve the order + * + * @private + */ + function unstripNoWiki( $text, &$state ) { + if ( !isset( $state['nowiki'] ) ) { + return $text; + } + + # TODO: good candidate for FSS + $text = strtr( $text, $state['nowiki'] ); + + return $text; + } + + /** + * Take an array of attribute names and values and normalize or discard + * illegal values for the given element type. + * + * - Discards attributes not on a whitelist for the given element + * - Unsafe style attributes are discarded + * + * @param array $attribs + * @param string $element + * @return array + * + * @todo Check for legal values where the DTD limits things. + * @todo Check for unique id attribute :P + */ + function validateTagAttributes( $attribs, $element ) { + $whitelist = array_flip( attributeWhitelist( $element ) ); + $out = array(); + foreach( $attribs as $attribute => $value ) { + if( !isset( $whitelist[$attribute] ) ) { + continue; + } + # Strip javascript "expression" from stylesheets. + # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp + if( $attribute == 'style' ) { + $value = checkCss( $value ); + if( $value === false ) { + # haxx0r + continue; + } + } + + if ( $attribute === 'id' ) + $value = escapeId( $value ); + + // If this attribute was previously set, override it. + // Output should only have one attribute of each name. + $out[$attribute] = $value; + } + return $out; + } + + /** + * Pick apart some CSS and check it for forbidden or unsafe structures. + * Returns a sanitized string, or false if it was just too evil. + * + * Currently URL references, 'expression', 'tps' are forbidden. + * + * @param string $value + * @return mixed + */ + function checkCss( $value ) { + $stripped = decodeCharReferences( $value ); + + // Remove any comments; IE gets token splitting wrong + $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped ); + $value = $stripped; + + // ... and continue checks + $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e', + 'codepointToUtf8(hexdec("$1"))', $stripped ); + $stripped = str_replace( '\\', '', $stripped ); + if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', + $stripped ) ) { + # haxx0r + return false; + } + + return $value; + } + + /** + * Decode any character references, numeric or named entities, + * in the text and return a UTF-8 string. + * + * @param string $text + * @return string + * @access public + * @static + */ + function decodeCharReferences( $text ) { + return preg_replace_callback( + MW_CHAR_REFS_REGEX, + 'decodeCharReferencesCallback', + $text ); + } + + /** + * Fetch the whitelist of acceptable attributes for a given + * element name. + * + * @param string $element + * @return array + */ + function attributeWhitelist( $element ) { + static $list; + if( !isset( $list ) ) { + $list = setupAttributeWhitelist(); + } + return isset( $list[$element] ) + ? $list[$element] + : array(); + } + + /** + * @todo Document it a bit + * @return array + */ + function setupAttributeWhitelist() { + $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' ); + $block = array_merge( $common, array( 'align' ) ); + $tablealign = array( 'align', 'char', 'charoff', 'valign' ); + $tablecell = array( 'abbr', + 'axis', + 'headers', + 'scope', + 'rowspan', + 'colspan', + 'nowrap', # deprecated + 'width', # deprecated + 'height', # deprecated + 'bgcolor' # deprecated + ); + + # Numbers refer to sections in HTML 4.01 standard describing the element. + # See: http://www.w3.org/TR/html4/ + $whitelist = array ( + # 7.5.4 + 'div' => $block, + 'center' => $common, # deprecated + 'span' => $block, # ?? + + # 7.5.5 + 'h1' => $block, + 'h2' => $block, + 'h3' => $block, + 'h4' => $block, + 'h5' => $block, + 'h6' => $block, + + # 7.5.6 + # address + + # 8.2.4 + # bdo + + # 9.2.1 + 'em' => $common, + 'strong' => $common, + 'cite' => $common, + # dfn + 'code' => $common, + # samp + # kbd + 'var' => $common, + # abbr + # acronym + + # 9.2.2 + 'blockquote' => array_merge( $common, array( 'cite' ) ), + # q + + # 9.2.3 + 'sub' => $common, + 'sup' => $common, + + # 9.3.1 + 'p' => $block, + + # 9.3.2 + 'br' => array( 'id', 'class', 'title', 'style', 'clear' ), + + # 9.3.4 + 'pre' => array_merge( $common, array( 'width' ) ), + + # 9.4 + 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ), + 'del' => array_merge( $common, array( 'cite', 'datetime' ) ), + + # 10.2 + 'ul' => array_merge( $common, array( 'type' ) ), + 'ol' => array_merge( $common, array( 'type', 'start' ) ), + 'li' => array_merge( $common, array( 'type', 'value' ) ), + + # 10.3 + 'dl' => $common, + 'dd' => $common, + 'dt' => $common, + + # 11.2.1 + 'table' => array_merge( $common, + array( 'summary', 'width', 'border', 'frame', + 'rules', 'cellspacing', 'cellpadding', + 'align', 'bgcolor', + ) ), + + # 11.2.2 + 'caption' => array_merge( $common, array( 'align' ) ), + + # 11.2.3 + 'thead' => array_merge( $common, $tablealign ), + 'tfoot' => array_merge( $common, $tablealign ), + 'tbody' => array_merge( $common, $tablealign ), + + # 11.2.4 + 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ), + 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ), + + # 11.2.5 + 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ), + + # 11.2.6 + 'td' => array_merge( $common, $tablecell, $tablealign ), + 'th' => array_merge( $common, $tablecell, $tablealign ), + + # 12.2 + # added by dan + 'a' => array_merge( $common, array( 'href', 'name' ) ), + + # 13.2 + # added by dan + 'img' => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ), + + # 15.2.1 + 'tt' => $common, + 'b' => $common, + 'i' => $common, + 'big' => $common, + 'small' => $common, + 'strike' => $common, + 's' => $common, + 'u' => $common, + + # 15.2.2 + 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ), + # basefont + + # 15.3 + 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ), + + # XHTML Ruby annotation text module, simple ruby only. + # http://www.w3c.org/TR/ruby/ + 'ruby' => $common, + # rbc + # rtc + 'rb' => $common, + 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), + 'rp' => $common, + + # For compatibility with the XHTML parser. + 'nowiki' => array(), + 'noinclude' => array(), + 'nodisplay' => array(), + + # XHTML stuff + 'acronym' => $common + ); + return $whitelist; + } + + /** + * Given a value escape it so that it can be used in an id attribute and + * return it, this does not validate the value however (see first link) + * + * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters + * in the id and + * name attributes + * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute + * + * @bug 4461 + * + * @static + * + * @param string $id + * @return string + */ + function escapeId( $id ) { + static $replace = array( + '%3A' => ':', + '%' => '.' + ); + + $id = urlencode( decodeCharReferences( strtr( $id, ' ', '_' ) ) ); + + return str_replace( array_keys( $replace ), array_values( $replace ), $id ); + } + + /** + * More or less "markup-safe" explode() + * Ignores any instances of the separator inside <...> + * @param string $separator + * @param string $text + * @return array + */ + function wfExplodeMarkup( $separator, $text ) { + $placeholder = "\x00"; + + // Just in case... + $text = str_replace( $placeholder, '', $text ); + + // Trim stuff + $replacer = new ReplacerCallback( $separator, $placeholder ); + $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text ); + + $items = explode( $separator, $cleaned ); + foreach( $items as $i => $str ) { + $items[$i] = str_replace( $placeholder, $separator, $str ); + } + + return $items; + } + + class ReplacerCallback { + function ReplacerCallback( $from, $to ) { + $this->from = $from; + $this->to = $to; + } + + function go( $matches ) { + return str_replace( $this->from, $this->to, $matches[1] ); + } + } + + /** + * Return an associative array of attribute names and values from + * a partial tag string. Attribute names are forces to lowercase, + * character references are decoded to UTF-8 text. + * + * @param string + * @return array + */ + function decodeTagAttributes( $text ) { + $attribs = array(); + + if( trim( $text ) == '' ) { + return $attribs; + } + + $pairs = array(); + if( !preg_match_all( + MW_ATTRIBS_REGEX, + $text, + $pairs, + PREG_SET_ORDER ) ) { + return $attribs; + } + + foreach( $pairs as $set ) { + $attribute = strtolower( $set[1] ); + $value = getTagAttributeCallback( $set ); + + // Normalize whitespace + $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); + $value = trim( $value ); + + // Decode character references + $attribs[$attribute] = decodeCharReferences( $value ); + } + return $attribs; + } + + /** + * Pick the appropriate attribute value from a match set from the + * MW_ATTRIBS_REGEX matches. + * + * @param array $set + * @return string + * @access private + */ + function getTagAttributeCallback( $set ) { + if( isset( $set[6] ) ) { + # Illegal #XXXXXX color with no quotes. + return $set[6]; + } elseif( isset( $set[5] ) ) { + # No quotes. + return $set[5]; + } elseif( isset( $set[4] ) ) { + # Single-quoted + return $set[4]; + } elseif( isset( $set[3] ) ) { + # Double-quoted + return $set[3]; + } elseif( !isset( $set[2] ) ) { + # In XHTML, attributes must have a value. + # For 'reduced' form, return explicitly the attribute name here. + return $set[1]; + } else { + die_friendly('Parser error', "

Tag conditions not met. This should never happen and is a bug.

" ); + } + } + + /** + * Strips and renders nowiki, pre, math, hiero + * If $render is set, performs necessary rendering operations on plugins + * Returns the text, and fills an array with data needed in unstrip() + * If the $state is already a valid strip state, it adds to the state + * + * @param bool $stripcomments when set, HTML comments + * will be stripped in addition to other tags. This is important + * for section editing, where these comments cause confusion when + * counting the sections in the wikisource + * + * @param array dontstrip contains tags which should not be stripped; + * used to prevent stipping of when saving (fixes bug 2700) + * + * @access private + */ + function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) { + global $wgRandomKey; + $render = true; + + $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff)); + $uniq_prefix =& $wgRandomKey; + $commentState = array(); + + $elements = array( 'nowiki', 'gallery' ); + + # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700) + foreach ( $elements AS $k => $v ) { + if ( !in_array ( $v , $dontstrip ) ) continue; + unset ( $elements[$k] ); + } + + $matches = array(); + $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix ); + + foreach( $matches as $marker => $data ) { + list( $element, $content, $params, $tag ) = $data; + if( $render ) { + $tagName = strtolower( $element ); + switch( $tagName ) { + case '!--': + // Comment + if( substr( $tag, -3 ) == '-->' ) { + $output = $tag; + } else { + // Unclosed comment in input. + // Close it so later stripping can remove it + $output = "$tag-->"; + } + break; + case 'html': + if( $wgRawHtml ) { + $output = $content; + break; + } + // Shouldn't happen otherwise. :) + case 'nowiki': + $output = wfEscapeHTMLTagsOnly( $content ); + break; + default: + } + } else { + // Just stripping tags; keep the source + $output = $tag; + } + + // Unstrip the output, because unstrip() is no longer recursive so + // it won't do it itself + $output = unstrip( $output, $state ); + + if( !$stripcomments && $element == '!--' ) { + $commentState[$marker] = $output; + } elseif ( $element == 'html' || $element == 'nowiki' ) { + $state['nowiki'][$marker] = $output; + } else { + $state['general'][$marker] = $output; + } + } + + # Unstrip comments unless explicitly told otherwise. + # (The comments are always stripped prior to this point, so as to + # not invoke any extension tags / parser hooks contained within + # a comment.) + if ( !$stripcomments ) { + // Put them all back and forget them + $text = strtr( $text, $commentState ); + } + + return $text; + } + + /** + * Replaces all occurrences of HTML-style comments and the given tags + * in the text with a random marker and returns teh next text. The output + * parameter $matches will be an associative array filled with data in + * the form: + * 'UNIQ-xxxxx' => array( + * 'element', + * 'tag content', + * array( 'param' => 'x' ), + * 'tag content' ) ) + * + * @param $elements list of element names. Comments are always extracted. + * @param $text Source text string. + * @param $uniq_prefix + * + * @access private + * @static + */ + function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){ + static $n = 1; + $stripped = ''; + $matches = array(); + + $taglist = implode( '|', $elements ); + $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i"; + + while ( '' != $text ) { + $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE ); + $stripped .= $p[0]; + if( count( $p ) < 5 ) { + break; + } + if( count( $p ) > 5 ) { + // comment + $element = $p[4]; + $attributes = ''; + $close = ''; + $inside = $p[5]; + } else { + // tag + $element = $p[1]; + $attributes = $p[2]; + $close = $p[3]; + $inside = $p[4]; + } + + $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU'; + $stripped .= $marker; + + if ( $close === '/>' ) { + // Empty element tag, + $content = null; + $text = $inside; + $tail = null; + } else { + if( $element == '!--' ) { + $end = '/(-->)/'; + } else { + $end = "/(<\\/$element\\s*>)/i"; + } + $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE ); + $content = $q[0]; + if( count( $q ) < 3 ) { + # No end tag -- let it run out to the end of the text. + $tail = ''; + $text = ''; + } else { + $tail = $q[1]; + $text = $q[2]; + } + } + + $matches[$marker] = array( $element, + $content, + decodeTagAttributes( $attributes ), + "<$element$attributes$close$content$tail" ); + } + return $stripped; + } + + /** + * Escape html tags + * Basically replacing " > and < with HTML entities ( ", >, <) + * + * @param $in String: text that might contain HTML tags. + * @return string Escaped string + */ + function wfEscapeHTMLTagsOnly( $in ) { + return str_replace( + array( '"', '>', '<' ), + array( '"', '>', '<' ), + $in ); + } + + /** + * Restores pre, math, and other extensions removed by strip() + * + * always call unstripNoWiki() after this one + * @private + */ + function unstrip( $text, &$state ) { + if ( !isset( $state['general'] ) ) { + return $text; + } + + # TODO: good candidate for FSS + $text = strtr( $text, $state['general'] ); + + return $text; + } + + /** + * Return UTF-8 string for a codepoint if that is a valid + * character reference, otherwise U+FFFD REPLACEMENT CHARACTER. + * @param int $codepoint + * @return string + * @private + */ + function decodeChar( $codepoint ) { + if( validateCodepoint( $codepoint ) ) { + return codepointToUtf8( $codepoint ); + } else { + return UTF8_REPLACEMENT; + } + } + + /** + * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, + * return the UTF-8 encoding of that character. Otherwise, returns + * pseudo-entity source (eg &foo;) + * + * @param string $name + * @return string + */ + function decodeEntity( $name ) { + global $wgHtmlEntities; + if( isset( $wgHtmlEntities[$name] ) ) { + return codepointToUtf8( $wgHtmlEntities[$name] ); + } else { + return "&$name;"; + } + } + + /** + * Returns true if a given Unicode codepoint is a valid character in XML. + * @param int $codepoint + * @return bool + */ + function validateCodepoint( $codepoint ) { + return ($codepoint == 0x09) + || ($codepoint == 0x0a) + || ($codepoint == 0x0d) + || ($codepoint >= 0x20 && $codepoint <= 0xd7ff) + || ($codepoint >= 0xe000 && $codepoint <= 0xfffd) + || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff); + } + +/** + * Return UTF-8 sequence for a given Unicode code point. + * May die if fed out of range data. + * + * @param $codepoint Integer: + * @return String + * @public + */ +function codepointToUtf8( $codepoint ) { + if($codepoint < 0x80) return chr($codepoint); + if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) . + chr($codepoint & 0x3f | 0x80); + if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) . + chr($codepoint >> 6 & 0x3f | 0x80) . + chr($codepoint & 0x3f | 0x80); + if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) . + chr($codepoint >> 12 & 0x3f | 0x80) . + chr($codepoint >> 6 & 0x3f | 0x80) . + chr($codepoint & 0x3f | 0x80); + + echo "Asked for code outside of range ($codepoint)\n"; + die( -1 ); +} + + /** + * @param string $matches + * @return string + */ + function decodeCharReferencesCallback( $matches ) { + if( $matches[1] != '' ) { + return Sanitizer::decodeEntity( $matches[1] ); + } elseif( $matches[2] != '' ) { + return Sanitizer::decodeChar( intval( $matches[2] ) ); + } elseif( $matches[3] != '' ) { + return Sanitizer::decodeChar( hexdec( $matches[3] ) ); + } elseif( $matches[4] != '' ) { + return Sanitizer::decodeChar( hexdec( $matches[4] ) ); + } + # Last case should be an ampersand by itself + return $matches[0]; + } + +?>