From ea50c2acb68494b8a4dfd36d7dbf8c488ee8b4cf Mon Sep 17 00:00:00 2001 From: acli Date: Sun, 19 Jan 2003 06:15:44 +0000 Subject: [PATCH] Preliminary fix of the CGI.pm problem of always assuming that everything is in ISO-8859-1. A new C4::Charset module (tentative name) has been created to guess the charset of a piece of HTML markup. The CGI programs will be modified to use this module as they are encountered during translation. --- C4/Auth.pm | 12 ++- C4/Charset.pm | 109 +++++++++++++++++++++++++++ acqui.simple/addbooks.pl | 6 +- admin-home.pl | 6 +- admin/marc_subfields_structure.pl | 6 +- admin/systempreferences.pl | 6 +- catalogue-home.pl | 6 +- mainpage.pl | 6 +- t/Charset.t | 147 +++++++++++++++++++++++++++++++++++++ 9 files changed, 295 insertions(+), 9 deletions(-) create mode 100644 C4/Charset.pm create mode 100644 t/Charset.t diff --git a/C4/Auth.pm b/C4/Auth.pm index cba4932..b70eb16 100644 --- a/C4/Auth.pm +++ b/C4/Auth.pm @@ -23,6 +23,7 @@ use Digest::MD5 qw(md5_base64); require Exporter; use C4::Context; use C4::Output; # to get the template +use C4::Charset; use C4::Circulation::Circ2; # getpatroninformation use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); @@ -49,7 +50,10 @@ C4::Auth - Authenticates Koha users flagsrequired => {borrow => 1}, }); - print $query->header(-cookie => $cookie), $template->output; + print $query->header( + -type => guesstype($template->output), + -cookie => $cookie + ), $template->output; =head1 DESCRIPTION @@ -284,7 +288,6 @@ sub checkauth { -expires => '+1y'); } return ($userid, $cookie, $sessionID, $flags); - exit; } # else we have a problem... # get the inputs from the incoming query @@ -305,7 +308,10 @@ sub checkauth { $cookie=$query->cookie(-name => 'sessionID', -value => $sessionID, -expires => '+1y'); - print $query->header(-cookie=>$cookie), $template->output; + print $query->header( + -type => guesstype($template->output), + -cookie => $cookie + ), $template->output; exit; } diff --git a/C4/Charset.pm b/C4/Charset.pm new file mode 100644 index 0000000..e16db6d --- /dev/null +++ b/C4/Charset.pm @@ -0,0 +1,109 @@ +package C4::Charset; + +# $Id$ + +#package to work around problems in HTTP headers +# Note: This is just a utility module; it should not be instantiated. + + +# Copyright 2003 Katipo Communications +# +# This file is part of Koha. +# +# Koha is free software; you can redistribute it and/or modify it under the +# terms of the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any later +# version. +# +# Koha is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place, +# Suite 330, Boston, MA 02111-1307 USA + +use strict; +require Exporter; + +use vars qw($VERSION @ISA @EXPORT); + +# set the version for version checking +$VERSION = 0.01; + +=head1 NAME + +C4::Charset - Functions for handling charsets in HTML pages + +=head1 SYNOPSIS + + use C4::Charset; + + print $query->header(-type => C4::Charset::gettype($output)), $output; + +=head1 DESCRIPTION + +The functions in this module peek into a piece of HTML and return strings +related to the (guessed) charset. + +=head1 FUNCTIONS + +=over 2 + +=cut + +@ISA = qw(Exporter); +@EXPORT = qw( + &guesscharset + &guesstype + ); + +=pod + + &guesscharset($output) + +"Guesses" the charset from the some HTML that would be output. + +C<$output> is the HTML page to be output. If it contains a META tag +with a Content-Type, the tag will be scanned for a language code. +This code is returned if it is found; undef is returned otherwise. + +This function only does sloppy guessing; it will be confused by +unexpected things like SGML comments. What it basically does is to +grab something that looks like a META tag and scan it. + +=cut + +sub guesscharset ($) { + my($html) = @_; + my $charset = undef; + local($`, $&, $', $1, $2, $3); + # FIXME... These regular expressions will miss a lot of valid tags! + if ($html =~ //is) { + $charset = $3; + } elsif ($html =~ //is) { + $charset = $2; + } + return $charset; +} # guess + +sub guesstype ($) { + my($html) = @_; + my $charset = guesscharset($html); + return defined $charset? "text/html; charset=$charset": "text/html"; +} + +#--------------------------------- + +END { } # module clean-up code here (global destructor) + +1; +__END__ + +=back + +=head1 AUTHOR + +Koha Developement team + +=cut diff --git a/acqui.simple/addbooks.pl b/acqui.simple/addbooks.pl index 7b6aed3..e566cdd 100755 --- a/acqui.simple/addbooks.pl +++ b/acqui.simple/addbooks.pl @@ -39,6 +39,7 @@ use C4::Auth; use C4::Catalogue; use C4::Biblio; use C4::Output; +use C4::Charset; use HTML::Template; my $query = new CGI; @@ -53,4 +54,7 @@ my ($template, $loggedinuser, $cookie) flagsrequired => {catalogue => 1}, debug => 1, }); -print $query->header(-cookie => $cookie),$template->output; +print $query->header( + -type => guesstype($template->output), + -cookie => $cookie +),$template->output; diff --git a/admin-home.pl b/admin-home.pl index daa8781..779883a 100755 --- a/admin-home.pl +++ b/admin-home.pl @@ -4,6 +4,7 @@ use strict; use CGI; use C4::Auth; use C4::Output; +use C4::Charset; use C4::Database; use HTML::Template; @@ -18,4 +19,7 @@ my ($template, $loggedinuser, $cookie) }); $template->param(loggeninuser => $loggedinuser); -print $query->header(-cookie => $cookie),$template->output; +print $query->header( + -type => guesstype($template->output), + -cookie => $cookie +),$template->output; diff --git a/admin/marc_subfields_structure.pl b/admin/marc_subfields_structure.pl index 5c7ebd1..67facc2 100755 --- a/admin/marc_subfields_structure.pl +++ b/admin/marc_subfields_structure.pl @@ -20,6 +20,7 @@ use strict; use C4::Output; +use C4::Charset; use C4::Auth; use CGI; use C4::Search; @@ -346,4 +347,7 @@ if ($op eq 'add_form') { } } #---- END $OP eq DEFAULT -print $input->header(-cookie => $cookie), $template->output; +print $input->header( + -type => guesstype($template->output), + -cookie => $cookie +), $template->output; diff --git a/admin/systempreferences.pl b/admin/systempreferences.pl index fe4210e..332657c 100755 --- a/admin/systempreferences.pl +++ b/admin/systempreferences.pl @@ -42,6 +42,7 @@ use CGI; use C4::Auth; use C4::Context; use C4::Output; +use C4::Charset; use C4::Search; use HTML::Template; use C4::Context; @@ -184,4 +185,7 @@ if ($op eq 'add_form') { } } #---- END $OP eq DEFAULT -print $input->header(-cookie => $cookie), $template->output; +print $input->header( + -type => guesstype($template->output), + -cookie => $cookie +), $template->output; diff --git a/catalogue-home.pl b/catalogue-home.pl index df94438..95ae211 100755 --- a/catalogue-home.pl +++ b/catalogue-home.pl @@ -4,6 +4,7 @@ use strict; use CGI; use C4::Auth; use C4::Output; +use C4::Charset; use C4::Database; use HTML::Template; @@ -26,4 +27,7 @@ $template->param(loggedinuser => $loggedinuser, classlist => $classlist, type => 'intranet',); -print $query->header(-cookie => $cookie), $template->output; +print $query->header( + -type => guesstype($template->output), + -cookie => $cookie +), $template->output; diff --git a/mainpage.pl b/mainpage.pl index 5a7f605..5ca4a0f 100755 --- a/mainpage.pl +++ b/mainpage.pl @@ -4,6 +4,7 @@ use strict; require Exporter; use C4::Database; use C4::Output; # contains gettemplate +use C4::Charset; use CGI; use C4::Auth; @@ -17,4 +18,7 @@ my ($template, $loggedinuser, $cookie) debug => 1, }); -print $query->header(-cookie => $cookie), $template->output; +print $query->header( + -type => guesstype($template->output), + -cookie => $cookie +), $template->output; diff --git a/t/Charset.t b/t/Charset.t new file mode 100644 index 0000000..167235a --- /dev/null +++ b/t/Charset.t @@ -0,0 +1,147 @@ +use strict; +use C4::Charset; + +use vars qw( @tests ); +use vars qw( $loaded ); + +BEGIN { + @tests = ( + [ + 'Normal HTML without meta tag', + sub { C4::Charset::guesscharset($_[0]) }, + undef, + <control case +EOF + ], [ + 'Result of guesscharset with normal HTML with irrelevant meta tag', + sub { C4::Charset::guesscharset($_[0]) }, + undef, + < +EOF + ], [ + 'Result of guesscharset with normal HTML with irrelevant meta tag', + sub { C4::Charset::guesstype($_[0]) }, + undef, + < +EOF + ], [ + 'Result of guesscharset with normal HTML with relevant meta tag', + sub { C4::Charset::guesscharset($_[0]) }, + 'big5', + < +EOF + ], [ + 'Result of guesstype with normal HTML with relevant meta tag', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=big5', + < +EOF + ], [ + 'Variant 1 using single quotes', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=iso-2022-jp', + < +EOF + ], [ + 'Variant 2 using single quotes', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=utf-8', + < +EOF + ], [ + 'Unquoted Content-Type', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=big5', + < +EOF + ], [ + 'XML syntax', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=iso-8859-2', + < +EOF + ], [ + 'Expected attributes in reverse order', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=big5', + < +EOF + ], [ + 'Extra whitespace at end', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=big5', + < +EOF + ], [ + 'Multiple lines', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=big5', + < +EOF + ], [ + 'With surrounding HTML', + sub { C4::Charset::guesstype($_[0]) }, + 'text/html; charset=us-ascii', + < + +Test case with surrounding HTML + + + +The return value should not be contaiminated with any surround HTML +FIXME: Auth.pm returns in code that can contaminate the charset +FIXME: if we do not explicitly disallow whitespace in the charset + + +EOF + ], +); +} + +BEGIN { $| = 1; printf "1..%d\n", scalar(@tests); } +END {print "not ok 1\n" unless $loaded;} +$loaded = 1; + + +# Run all tests in sequence +for (my $i = 1; $i <= scalar @tests; $i += 1) { + my $test = $tests[$i - 1]; + my($title, $f, $expected, $input) = @$test; + die "not ok $i (malformed test case)\n" + unless @$test == 4 && ref $f eq 'CODE'; + + my $output = &$f($input); + if ( + (!defined $output && !defined $expected) + || (defined $output && defined $expected && $output eq $expected) + ) { + print "ok $i ($title)\n"; + } else { + print "not ok $i ($title: got ", + (defined $output? "\"$output\"": 'undef'), + ', expected ', + (defined $expected? "\"$expected\"": 'undef'), + ")\n"; + } +} + + + + + -- 1.7.2.5