From: Richard D. <rd...@us...> - 2004-01-31 05:56:06
|
Update of /cvsroot/twiki/twiki/lib In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21775 Modified Files: TWiki.pm Log Message: Codev.EncodeURLsWithUTF8 - support for attachments to I18N-named pages when using UTF-8 URLs. Should also allow Mozilla-based browsers to work with Codev.TWikiOnMainframe. All attachments now go through viewfile, which incurs a redirect but avoids extra code just to convert attachment URLs to site character set. Index: TWiki.pm =================================================================== RCS file: /cvsroot/twiki/twiki/lib/TWiki.pm,v retrieving revision 1.264 retrieving revision 1.265 diff -C2 -r1.264 -r1.265 *** TWiki.pm 29 Jan 2004 04:36:20 -0000 1.264 --- TWiki.pm 30 Jan 2004 07:49:31 -0000 1.265 *************** *** 93,97 **** use vars qw( $basicInitDone $useLocale $localeRegexes $siteLocale $siteCharset ! $siteCharsetOverride $siteLang $upperNational $lowerNational --- 93,97 ---- use vars qw( $basicInitDone $useLocale $localeRegexes $siteLocale $siteCharset ! $siteCharsetOverride $siteLang $urlCharEncoding $upperNational $lowerNational *************** *** 102,107 **** $singleMixedAlphaNumRegex $singleMixedNonAlphaNumRegex $singleMixedNonAlphaRegex $mixedAlphaNumRegex ! $validAsciiStringRegex ! $validUtf8CharRegex $validUtf8StringRegex ); --- 102,106 ---- $singleMixedAlphaNumRegex $singleMixedNonAlphaNumRegex $singleMixedNonAlphaRegex $mixedAlphaNumRegex ! $validAsciiStringRegex $validUtf8CharRegex $validUtf8StringRegex ); *************** *** 617,636 **** } =head2 convertUtf8URLtoSiteCharset( $webName, $topicName ) Return value: ( string $convertedWebName, string $convertedTopicName) - Auto-detect UTF-8 vs. site charset in URL, and convert UTF-8 into site charset. ! FIXME: remove dependence on webname and topicname, i.e. generic encoding ! subroutine =cut to implementation sub convertUtf8URLtoSiteCharset { my ( $webName, $topicName ) = @_; - # FIXME: Make it possible to set $siteCharset independently to - # handle mismatch between 'locale -a' and Perl supported charset names - writeDebug "URL web.topic is $webName.$topicName"; my $fullTopicName = "$webName.$topicName"; ! my ( $urlCharEncoding, $charEncoding ); # Detect character encoding of the full topic name from URL --- 616,633 ---- } + =head2 convertUtf8URLtoSiteCharset( $webName, $topicName ) Return value: ( string $convertedWebName, string $convertedTopicName) Auto-detect UTF-8 vs. site charset in URL, and convert UTF-8 into site charset. ! ! TODO: remove dependence on webname and topicname, i.e. generic encoding ! subroutine. =cut to implementation sub convertUtf8URLtoSiteCharset { my ( $webName, $topicName ) = @_; writeDebug "URL web.topic is $webName.$topicName"; my $fullTopicName = "$webName.$topicName"; ! my $charEncoding; # Detect character encoding of the full topic name from URL *************** *** 666,678 **** writeWarning "Conversion to \$siteCharset '$siteCharset' not supported, or name not recognised - check 'perldoc Encode::Supported'"; } else { ! writeDebug "Converting with Encode, valid encoding is '$charEncoding'"; ! # Convert text, inserting HTML entities for characters that can't be converted ! # - first convert from UTF8 bytes into internal UTF-8) characters $fullTopicName = Encode::decode("utf8", $fullTopicName); ! ##writeDebug "Encode::decode result is $fullTopicName"; ! # - then convert into site charset from internal UTF-8 ! $fullTopicName = Encode::encode( $charEncoding, $fullTopicName ); ! # $fullTopicName = Encode::encode( $charEncoding, $fullTopicName, &Encode::FB_HTMLCREF ); ! ##writeDebug "Encode::encode result is $fullTopicName"; } --- 663,677 ---- writeWarning "Conversion to \$siteCharset '$siteCharset' not supported, or name not recognised - check 'perldoc Encode::Supported'"; } else { ! writeDebug "Converting with Encode, valid 'to' encoding is '$charEncoding'"; ! # Convert text using Encode: ! # - first, convert from UTF8 bytes into internal (UTF-8) characters $fullTopicName = Encode::decode("utf8", $fullTopicName); ! # - then convert into site charset from internal UTF-8, ! # inserting \x{NNNN} for characters that can't be converted ! { ! no strict 'subs'; ! $fullTopicName = Encode::encode( $charEncoding, $fullTopicName, Encode::FB_PERLQQ ); ! } ! ##writeDebug "Encode result is $fullTopicName"; } *************** *** 2088,2092 **** if( $type =~ /^entit(y|ies)$/i ) { # HTML entity encoding ! # TODO: Encode to Unicode first $text =~ s/\"/\&\#034;/g; $text =~ s/\%/\&\#037;/g; --- 2087,2091 ---- if( $type =~ /^entit(y|ies)$/i ) { # HTML entity encoding ! # TODO: Encode characters > 0x7F to Unicode first $text =~ s/\"/\&\#034;/g; $text =~ s/\%/\&\#037;/g; *************** *** 2101,2105 **** } else { # URL encoding - # TODO: Encode to UTF-8 first $text =~ s/[\n\r]/\%3Cbr\%20\%3E/g; $text =~ s/\s+/\%20/g; --- 2100,2103 ---- *************** *** 2110,2114 **** $text =~ s/\>/\%3E/g; $text =~ s/\\/\%5C/g; ! # Encode characters with 8th bit set (ASCII-derived charsets only) $text =~ s/([\x7f-\xff])/'%' . unpack( "H*", $1 ) /ge; } --- 2108,2113 ---- $text =~ s/\>/\%3E/g; $text =~ s/\\/\%5C/g; ! # Encode characters > 0x7F (ASCII-derived charsets only) ! # TODO: Encode to UTF-8 first $text =~ s/([\x7f-\xff])/'%' . unpack( "H*", $1 ) /ge; } *************** *** 2118,2133 **** # ========================= # Encode characters with 8th bit set for use in URLs with non-UTF-8 '$siteCharset' ! # encoding by browser - mainly for Mozilla POST URLs. Ignored when using UTF-8 URLs ! # or when on EBCDIC platforms sub handleIntUrlEncode { my( $theStr, $doExtract ) = @_; - # FIXME: Detect whether UTF-8 URL was used when requesting this page - # Detect EBCDIC platform my $isEbcdic = ( 'A' eq chr(193) ); ! if( $isEbcdic ) { ! # URL encoding breaks EBCDIC, so just strip double quotes $theStr =~ s/^"(.*)"$/$1/; return $theStr; --- 2117,2130 ---- # ========================= # Encode characters with 8th bit set for use in URLs with non-UTF-8 '$siteCharset' ! # encoding by browser - mainly for older browsers with no UTF-8 support. ! # Ignored when using UTF-8 URLs or when on EBCDIC platforms. sub handleIntUrlEncode { my( $theStr, $doExtract ) = @_; # Detect EBCDIC platform my $isEbcdic = ( 'A' eq chr(193) ); ! if( $urlCharEncoding eq 'UTF-8' or $isEbcdic ) { ! # Just strip double quotes, no URL encoding $theStr =~ s/^"(.*)"$/$1/; return $theStr; *************** *** 2186,2191 **** # Make Edit URL unique for every edit - fix for RefreshEditPage. ! # URL encoding fixes Codev.MozillaURLEncodingWithI18N. ! $_[0] =~ s!%EDITURL%!"$scriptUrlPath/edit$scriptSuffix/%INTURLENCODE{\"%WEB%/%TOPIC%\"}%\?t=" . time()!ge; $_[0] =~ s/%NOP{(.*?)}%/$1/gs; # remove NOP tag in template topics but show content --- 2183,2187 ---- # Make Edit URL unique for every edit - fix for RefreshEditPage. ! $_[0] =~ s!%EDITURL%!"$scriptUrlPath/edit$scriptSuffix/%WEB%/%TOPIC%\?t=" . time()!ge; $_[0] =~ s/%NOP{(.*?)}%/$1/gs; # remove NOP tag in template topics but show content *************** *** 2199,2205 **** $_[0] =~ s/%REMOTE_USER%/&handleEnvVariable('REMOTE_USER')/ge; - # Un-encoded topic and web names. Note: In form action, URL encode variables - # that might have 8-bit characters with %INTURLENCODE{"%TOPIC%"}% - - # introduced due to Codev.MozillaURLEncodingWithI18N. $_[0] =~ s/%TOPIC%/$_[1]/g; $_[0] =~ s/%BASETOPIC%/$topicName/g; --- 2195,2198 ---- *************** *** 2220,2229 **** $_[0] =~ s/%PUBURL%/$urlHost$pubUrlPath/g; $_[0] =~ s/%PUBURLPATH%/$pubUrlPath/g; ! $_[0] =~ s/%ATTACHURL%/$urlHost$pubUrlPath\/$_[2]\/$_[1]/g; ! $_[0] =~ s/%ATTACHURLPATH%/$pubUrlPath\/$_[2]\/$_[1]/g; $_[0] =~ s/%ICON{(.*?)}%/&handleIcon($1)/geo; $_[0] =~ s/%URLPARAM{(.*?)}%/&handleUrlParam($1)/ge; $_[0] =~ s/%(URL)?ENCODE{(.*?)}%/&handleUrlEncode($2,1)/ge; # ENCODE is documented, URLENCODE is legacy ! $_[0] =~ s/%INTURLENCODE{(.*?)}%/&handleIntUrlEncode($1,1)/ge; $_[0] =~ s/%DATE%/&getGmDate()/ge; # deprecated, but used in signatures $_[0] =~ s/%GMTIME%/&handleTime("","gmtime")/ge; --- 2213,2222 ---- $_[0] =~ s/%PUBURL%/$urlHost$pubUrlPath/g; $_[0] =~ s/%PUBURLPATH%/$pubUrlPath/g; ! $_[0] =~ s!%ATTACHURL%!$urlHost$scriptUrlPath/viewfile$scriptSuffix/$_[2]/$_[1]?filename=!g; ! $_[0] =~ s!%ATTACHURLPATH%!$scriptUrlPath/viewfile$scriptSuffix/$_[2]/$_[1]?filename=!g; $_[0] =~ s/%ICON{(.*?)}%/&handleIcon($1)/geo; $_[0] =~ s/%URLPARAM{(.*?)}%/&handleUrlParam($1)/ge; $_[0] =~ s/%(URL)?ENCODE{(.*?)}%/&handleUrlEncode($2,1)/ge; # ENCODE is documented, URLENCODE is legacy ! $_[0] =~ s/%INTURLENCODE{(.*?)}%/&handleIntUrlEncode($1,1)/ge; # Deprecated due to UTF-8 URL support $_[0] =~ s/%DATE%/&getGmDate()/ge; # deprecated, but used in signatures $_[0] =~ s/%GMTIME%/&handleTime("","gmtime")/ge; |