From 6056e1d7ff9b26df2aa3a07ab0c615af1a9bbef4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Sven=20Sch=C3=B6ling?= Date: Wed, 28 Dec 2011 20:24:53 +0100 Subject: [PATCH] Recoding von Daten konzeptuell getrennt. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Es gibt 4 Pfade, um Daten in einen Request zu kriegen: - Kommandozeile - Datenbank (für gespeicherte Forms) - QUERY_STRING (http get) - STDIN (http post) Der letzte Teil teilt sich noch einmal in - x-www-form-urlencoded query string (normales http post) - multipart/form-data container (u.a. Dateiuploads) Alle Daten in LxOffice können über INPUT_ENCODING das encoding des Formulars überschreiben, das ist nötig, weil Javascript da sein eigenes Ding dreht. Das führt dazu, dass alle http-Quellen: 1. Normal dekodiert werden müssen 2. Später noch einmal recoded werden müssen, falls ein anderes encoding angegeben ist. Uploads, die mit dem encoding binary geschickt werden, dürfen dagegen gar nicht recoded werden. Deshalb wurden bisher alle multipart/form-data Daten davon ausgenommen, was aber zu Fehlern führt, wenn ein Formular gemischte Werte über multipart/form-data sendet. Am einfachsten zu demonstrieren im CsvImport, wenn man 'ä' als sep_char angibt. Dieser Patch ändert das in zwei Container, in die einsortiert wird: - Bekanntes Encoding (wird sofort nach $::form decoded) - Eventuell unbekanntes Encoding (wird decoded, aber in einen Zwischencontainer sortiert, und später recoded, falls nötig) Dadurch muss das recoding nicht mehr in-place gemacht werden. Alles in multipart/form-data wird jetzt decodiert, außer Dateiuploads (erkennbar am filename-Attribut) und explizit binary geflaggtes content-transfer-encoding. Bei kollidierendem INPUT_ENCODING und "content-type; charset" wird erst das content-type charset dekodiert, und dann ein recode aus internem coding in das angefragte INPUT_ENCODING gemacht. 
--- SL/Request.pm | 129 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 51 deletions(-) diff --git a/SL/Request.pm b/SL/Request.pm index b91bc524c..d339cb7be 100644 --- a/SL/Request.pm +++ b/SL/Request.pm @@ -49,11 +49,17 @@ sub _input_to_hash { $::lxdebug->leave_sub(2); } -sub parse_multipart_formdata { - my ($target, $input) = @_; - my ($name, $filename, $headers_done, $content_type, $boundary_found, $need_cr, $previous); - my $uploads = {}; +sub _parse_multipart_formdata { + my ($target, $temp_target, $input) = @_; + my ($name, $filename, $headers_done, $content_type, $boundary_found, $need_cr, $previous, $encoding, $transfer_encoding); + + # We SHOULD honor encodings and transfer-encodings here, but as hard as I + # looked I couldn't find a reasonably recent webbrowser that makes use of + # these. Transfer encoding just eats up bandwidth... + # so all I'm going to do is add a fail safe that if anyone ever encounters + # this, it's going to croak so that debugging is easier + $ENV{'CONTENT_TYPE'} =~ /multipart\/form-data\s*;\s*boundary\s*=\s*(.+)$/; my $boundary = '--' . $1; foreach my $line (split m/\n/, $input) { @@ -61,6 +67,7 @@ sub parse_multipart_formdata { if (($line eq $boundary) || ($line eq "$boundary\r")) { ${ $previous } =~ s|\r?\n$|| if $previous; + ${ $previous } = Encode::decode($encoding, $$previous) if $previous && !$filename && !$transfer_encoding eq 'binary'; undef $previous; undef $filename; @@ -69,6 +76,8 @@ sub parse_multipart_formdata { $content_type = "text/plain"; $boundary_found = 1; $need_cr = 0; + $encoding = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET; + $transfer_encoding = undef; next; } @@ -94,14 +103,29 @@ sub parse_multipart_formdata { substr $line, $-[0], $+[0] - $-[0], ""; } - $previous = _store_value($uploads, $name, '') if ($name); - $target->{FILENAME} = $filename if ($filename); + $previous = _store_value($filename ? 
$target : $temp_target, $name, '') if ($name); + $temp_target->{FILENAME} = $filename if ($filename); next; } - if ($line =~ m|^content-type\s*:\s*(.*?)$|i) { + if ($line =~ m|^content-type\s*:\s*(.*?)[;\$]|i) { $content_type = $1; + + if ($content_type =~ /^text/ && $line =~ m|;\s*charset\s*:\s*("?)(.*?)\1$|i) { + $encoding = $2; + } + + next; + } + + if ($line =~ m|^content-transfer-encoding\s*=\s*(.*?)$|i) { + $transfer_encoding = lc($1); + if ($transfer_encoding && $transfer_encoding !~ /^[78]bit|binary$/) { + die 'Transfer encodings beyond 7bit/8bit and binary are not implemented.'; + } + + next; } next; @@ -115,54 +139,37 @@ sub parse_multipart_formdata { ${ $previous } =~ s|\r?\n$|| if $previous; $::lxdebug->leave_sub(2); - -} - -sub _request_to_hash { - $::lxdebug->enter_sub(2); - - my ($target, $input) = @_; - my $uploads; - - if (!$ENV{'CONTENT_TYPE'} - || ($ENV{'CONTENT_TYPE'} !~ /multipart\/form-data\s*;\s*boundary\s*=\s*(.+)$/)) { - - $uploads = { }; - _input_to_hash($target, $input); - - } else { - $uploads = _parse_multipart_formdata($target, $input); - } - - $main::lxdebug->leave_sub(2); - return $uploads; } sub _recode_recursively { - $main::lxdebug->enter_sub(); - my ($iconv, $param) = @_; + $::lxdebug->enter_sub; + my ($iconv, $from, $to) = @_; - if (any { ref $param eq $_ } qw(Form HASH)) { - foreach my $key (keys %{ $param }) { - if (!ref $param->{$key}) { - # Workaround for a bug: converting $param->{$key} directly + if (any { ref $from eq $_ } qw(Form HASH)) { + for my $key (keys %{ $from }) { + if (!ref $from->{$key}) { + # Workaround for a bug: converting $from->{$key} directly # leads to 'undef'. I don't know why. Converting a copy works, # though. - $param->{$key} = $iconv->convert("" . $param->{$key}); + $to->{$key} = $iconv->convert("" . 
$from->{$key}); } else { - _recode_recursively($iconv, $param->{$key}); + $to->{$key} = {} if 'HASH' eq ref $from->{$key}; + $to->{$key} = [] if 'ARRAY' eq ref $from->{$key}; + _recode_recursively($iconv, $from->{$key}, $to->{$key}); } } - } elsif (ref $param eq 'ARRAY') { - foreach my $idx (0 .. scalar(@{ $param }) - 1) { - if (!ref $param->[$idx]) { - # Workaround for a bug: converting $param->[$idx] directly + } elsif (ref $from eq 'ARRAY') { + foreach my $idx (0 .. scalar(@{ $from }) - 1) { + if (!ref $from->[$idx]) { + # Workaround for a bug: converting $from->[$idx] directly # leads to 'undef'. I don't know why. Converting a copy works, # though. - $param->[$idx] = $iconv->convert("" . $param->[$idx]); + $from->[$idx] = $iconv->convert("" . $from->[$idx]); } else { - _recode_recursively($iconv, $param->[$idx]); + $to->[$idx] = {} if 'HASH' eq ref $from->[$idx]; + $to->[$idx] = [] if 'ARRAY' eq ref $from->[$idx]; + _recode_recursively($iconv, $from->[$idx], $to->[$idx]); } } } @@ -173,29 +180,49 @@ sub read_cgi_input { $::lxdebug->enter_sub; my ($target) = @_; + my $db_charset = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET; + + # yes i know, copying all those values around isn't terribly efficient, but + # the old version of dumping everything into form and then launching a + # tactical recode nuke at the data is still worse. 
- _input_to_hash($target, $ENV{QUERY_STRING}) if $ENV{QUERY_STRING}; - _input_to_hash($target, $ARGV[0]) if @ARGV && $ARGV[0]; + # this way the data can at least be recoded on the fly as soon as we get to + # know the source encoding and only in the cases where encoding may be hidden + # among the payload we take the hit of copying the request around + my $temp_target = { }; + + # since both of these can potentially bring their encoding in INPUT_ENCODING + # they get dumped into temp_target + _input_to_hash($temp_target, $ENV{QUERY_STRING}) if $ENV{QUERY_STRING}; + _input_to_hash($temp_target, $ARGV[0]) if @ARGV && $ARGV[0]; - my $uploads; if ($ENV{CONTENT_LENGTH}) { my $content; read STDIN, $content, $ENV{CONTENT_LENGTH}; - $uploads = _request_to_hash($target, $content); + open my $fh, '>:raw', '/tmp/blubb.bin' or die; + print $fh $content; + close $fh; + if ($ENV{'CONTENT_TYPE'} && $ENV{'CONTENT_TYPE'} =~ /multipart\/form-data/) { + # multipart formdata can bring it's own encoding, so give it both + # and let ti decide on it's own + _parse_multipart_formdata($target, $temp_target, $content); + } else { + # normal encoding must be recoded + _input_to_hash($temp_target, $content); + } } if ($target->{RESTORE_FORM_FROM_SESSION_ID}) { my %temp_form; $::auth->restore_form_from_session(delete $target->{RESTORE_FORM_FROM_SESSION_ID}, form => \%temp_form); - _input_to_hash($target, join '&', map { uri_encode($_) . '=' . 
uri_encode($temp_form{$_}) } keys %temp_form); + _store_value($target, $_, $temp_form{$_}) for keys %temp_form; } - my $db_charset = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET; - my $encoding = delete $target->{INPUT_ENCODING} || $db_charset; + my $encoding = delete $temp_target->{INPUT_ENCODING} || $db_charset; - _recode_recursively(SL::Iconv->new($encoding, $db_charset), $target); + _recode_recursively(SL::Iconv->new($encoding, $db_charset), $temp_target => $target) if keys %$target; - map { $target->{$_} = $uploads->{$_} } keys %{ $uploads } if $uploads; + map { $target->{$_} = $temp_target->{$_} } keys %{ $temp_target }; $::lxdebug->leave_sub; -- 2.20.1