From 6056e1d7ff9b26df2aa3a07ab0c615af1a9bbef4 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Sven=20Sch=C3=B6ling?= <s.schoeling@linet-services.de>
Date: Wed, 28 Dec 2011 20:24:53 +0100
Subject: [PATCH] Recoding von Daten konzeptuell getrennt.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Es gibt 4 Pfade um Daten in einen Request zu kriegen:

  - Kommandozeile
  - Datenbank (für gespeicherte Forms)
  - QUERY_STRING (http get)
  - STDIN (http post)

Der letzte Teil teilt sich noch einmal in

  - x-uri-encoded query string (normales http post)
  - multipart/form-data container (u.a. dateiuploads)

Alle Daten in LxOffice können über INPUT_ENCODING das encoding des Formulars
überschreiben, das ist nötig, weil Javascript da sein eigenes Ding dreht.

Das führt dazu, dass alle http Quellen:

  1. Normal dekodiert werden müssen
  2. Später noch einmal recoded werden müssen, falls ein anderes encoding
     angegeben ist.

Uploads, die mit dem encoding binary geschickt werden, dürfen dagegen gar nicht
recoded werden. Deshalb wurden bisher alle multipart/form-data Daten davon
ausgenommen, was aber zu Fehlern führt, wenn ein Formular gemischte Werte über
multipart/form-data sendet. Am einfachsten zu demonstrieren im CsvImport, wenn
man 'ä' als sep_char angibt.

Dieser Patch ändert das in zwei Container, in die einsortiert wird:

  - Bekanntes Encoding (wird sofort nach $::form decoded)
  - Eventuell unbekanntes Encoding (wird decoded, aber in einen
    Zwischencontainer sortiert, und später recoded, falls nötig)

Dadurch muss das recoding nicht mehr in-place gemacht werden.

Alles in multipart/form-data wird jetzt decodiert, ausser Dateiuploads
(erkennbar am filename Attribut) und explizit binary geflaggtes
content-transfer-encoding.

Bei kollidierendem INPUT_ENCODING und "content-type; charset" wird erst das
content-type charset dekodiert, und dann ein recode aus internem coding in das
angefragte INPUT_ENCODING gemacht.
---
 SL/Request.pm | 129 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 78 insertions(+), 51 deletions(-)

diff --git a/SL/Request.pm b/SL/Request.pm
index b91bc524c..d339cb7be 100644
--- a/SL/Request.pm
+++ b/SL/Request.pm
@@ -49,11 +49,17 @@ sub _input_to_hash {
   $::lxdebug->leave_sub(2);
 }
 
-sub parse_multipart_formdata {
-  my ($target, $input) = @_;
-  my ($name, $filename, $headers_done, $content_type, $boundary_found, $need_cr, $previous);
-  my $uploads = {};
+sub _parse_multipart_formdata {
+  my ($target, $temp_target, $input) = @_;
+  my ($name, $filename, $headers_done, $content_type, $boundary_found, $need_cr, $previous, $encoding, $transfer_encoding);
+
+  # We SHOULD honor encodings and transfer-encodings here, but as hard as I
+  # looked I couldn't find a reasonably recent webbrowser that makes use of
+  # these. Transfer encoding just eats up bandwidth...
 
+  # so all I'm going to do is add a fail safe that if anyone ever encounters
+  # this, it's going to croak so that debugging is easier
+  $ENV{'CONTENT_TYPE'} =~ /multipart\/form-data\s*;\s*boundary\s*=\s*(.+)$/;
   my $boundary = '--' . $1;
 
   foreach my $line (split m/\n/, $input) {
@@ -61,6 +67,7 @@ sub parse_multipart_formdata {
 
     if (($line eq $boundary) || ($line eq "$boundary\r")) {
       ${ $previous } =~ s|\r?\n$|| if $previous;
+      ${ $previous } =  Encode::decode($encoding, $$previous) if $previous && !$filename && !$transfer_encoding eq 'binary';
 
       undef $previous;
       undef $filename;
@@ -69,6 +76,8 @@ sub parse_multipart_formdata {
       $content_type   = "text/plain";
       $boundary_found = 1;
       $need_cr        = 0;
+      $encoding       = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET;
+      $transfer_encoding = undef;
 
       next;
     }
@@ -94,14 +103,29 @@ sub parse_multipart_formdata {
           substr $line, $-[0], $+[0] - $-[0], "";
         }
 
-        $previous           = _store_value($uploads, $name, '') if ($name);
-        $target->{FILENAME} = $filename if ($filename);
+        $previous                = _store_value($filename ? $target : $temp_target, $name, '') if ($name);
+        $temp_target->{FILENAME} = $filename if ($filename);
 
         next;
       }
 
-      if ($line =~ m|^content-type\s*:\s*(.*?)$|i) {
+      if ($line =~ m|^content-type\s*:\s*(.*?)[;\$]|i) {
         $content_type = $1;
+
+        if ($content_type =~ /^text/ && $line =~ m|;\s*charset\s*:\s*("?)(.*?)\1$|i) {
+          $encoding = $2;
+        }
+
+        next;
+      }
+
+      if ($line =~ m|^content-transfer-encoding\s*=\s*(.*?)$|i) {
+        $transfer_encoding = lc($1);
+        if ($transfer_encoding  && $transfer_encoding !~ /^[78]bit|binary$/) {
+          die 'Transfer encodings beyond 7bit/8bit and binary are not implemented.';
+        }
+
+        next;
       }
 
       next;
@@ -115,54 +139,37 @@ sub parse_multipart_formdata {
   ${ $previous } =~ s|\r?\n$|| if $previous;
 
   $::lxdebug->leave_sub(2);
-
-}
-
-sub _request_to_hash {
-  $::lxdebug->enter_sub(2);
-
-  my ($target, $input) = @_;
-  my $uploads;
-
-  if (!$ENV{'CONTENT_TYPE'}
-      || ($ENV{'CONTENT_TYPE'} !~ /multipart\/form-data\s*;\s*boundary\s*=\s*(.+)$/)) {
-
-   $uploads = { };
-    _input_to_hash($target, $input);
-
-  } else {
-   $uploads = _parse_multipart_formdata($target, $input);
-  }
-
-  $main::lxdebug->leave_sub(2);
-  return $uploads;
 }
 
 sub _recode_recursively {
-  $main::lxdebug->enter_sub();
-  my ($iconv, $param) = @_;
+  $::lxdebug->enter_sub;
+  my ($iconv, $from, $to) = @_;
 
-  if (any { ref $param eq $_ } qw(Form HASH)) {
-    foreach my $key (keys %{ $param }) {
-      if (!ref $param->{$key}) {
-        # Workaround for a bug: converting $param->{$key} directly
+  if (any { ref $from eq $_ } qw(Form HASH)) {
+    for my $key (keys %{ $from }) {
+      if (!ref $from->{$key}) {
+        # Workaround for a bug: converting $from->{$key} directly
         # leads to 'undef'. I don't know why. Converting a copy works,
         # though.
-        $param->{$key} = $iconv->convert("" . $param->{$key});
+        $to->{$key} = $iconv->convert("" . $from->{$key});
       } else {
-        _recode_recursively($iconv, $param->{$key});
+        $to->{$key} = {} if 'HASH'  eq ref $from->{$key};
+        $to->{$key} = [] if 'ARRAY' eq ref $from->{$key};
+        _recode_recursively($iconv, $from->{$key}, $to->{$key});
       }
     }
 
-  } elsif (ref $param eq 'ARRAY') {
-    foreach my $idx (0 .. scalar(@{ $param }) - 1) {
-      if (!ref $param->[$idx]) {
-        # Workaround for a bug: converting $param->[$idx] directly
+  } elsif (ref $from eq 'ARRAY') {
+    foreach my $idx (0 .. scalar(@{ $from }) - 1) {
+      if (!ref $from->[$idx]) {
+        # Workaround for a bug: converting $from->[$idx] directly
         # leads to 'undef'. I don't know why. Converting a copy works,
         # though.
-        $param->[$idx] = $iconv->convert("" . $param->[$idx]);
+        $from->[$idx] = $iconv->convert("" . $from->[$idx]);
       } else {
-        _recode_recursively($iconv, $param->[$idx]);
+        $to->[$idx] = {} if 'HASH'  eq ref $from->[$idx];
+        $to->[$idx] = [] if 'ARRAY' eq ref $from->[$idx];
+        _recode_recursively($iconv, $from->[$idx], $to->[$idx]);
       }
     }
   }
@@ -173,29 +180,49 @@ sub read_cgi_input {
   $::lxdebug->enter_sub;
 
   my ($target) = @_;
+  my $db_charset   = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET;
+
+  # yes i know, copying all those values around isn't terribly efficient, but
+  # the old version of dumping everything into form and then launching a
+  # tactical recode nuke at the data is still worse.
 
-  _input_to_hash($target, $ENV{QUERY_STRING}) if $ENV{QUERY_STRING};
-  _input_to_hash($target, $ARGV[0])           if @ARGV && $ARGV[0];
+  # this way the data can at least be recoded on the fly as soon as we get to
+  # know the source encoding and only in the cases where encoding may be hidden
+  # among the payload we take the hit of copying the request around
+  my $temp_target = { };
+
+  # since both of these can potentially bring their encoding in INPUT_ENCODING
+  # they get dumped into temp_target
+  _input_to_hash($temp_target, $ENV{QUERY_STRING}) if $ENV{QUERY_STRING};
+  _input_to_hash($temp_target, $ARGV[0])           if @ARGV && $ARGV[0];
 
-  my $uploads;
   if ($ENV{CONTENT_LENGTH}) {
     my $content;
     read STDIN, $content, $ENV{CONTENT_LENGTH};
-    $uploads = _request_to_hash($target, $content);
+    open my $fh, '>:raw', '/tmp/blubb.bin' or die;
+    print $fh $content;
+    close $fh;
+    if ($ENV{'CONTENT_TYPE'} && $ENV{'CONTENT_TYPE'} =~ /multipart\/form-data/) {
+      # multipart formdata can bring it's own encoding, so give it both
+      # and let ti decide on it's own
+      _parse_multipart_formdata($target, $temp_target, $content);
+    } else {
+      # normal encoding must be recoded
+      _input_to_hash($temp_target, $content);
+    }
   }
 
   if ($target->{RESTORE_FORM_FROM_SESSION_ID}) {
     my %temp_form;
     $::auth->restore_form_from_session(delete $target->{RESTORE_FORM_FROM_SESSION_ID}, form => \%temp_form);
-    _input_to_hash($target, join '&', map { uri_encode($_) . '=' . uri_encode($temp_form{$_}) } keys %temp_form);
+    _store_value($target, $_, $temp_form{$_}) for keys %temp_form;
   }
 
-  my $db_charset   = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET;
-  my $encoding     = delete $target->{INPUT_ENCODING} || $db_charset;
+  my $encoding     = delete $temp_target->{INPUT_ENCODING} || $db_charset;
 
-  _recode_recursively(SL::Iconv->new($encoding, $db_charset), $target);
+  _recode_recursively(SL::Iconv->new($encoding, $db_charset), $temp_target => $target) if keys %$target;
 
-  map { $target->{$_} = $uploads->{$_} } keys %{ $uploads } if $uploads;
+  map { $target->{$_} = $temp_target->{$_} } keys %{ $temp_target };
 
   $::lxdebug->leave_sub;
 
-- 
2.20.1