Parsing von multipart/formdata beschleuningt.
authorSven Schöling <s.schoeling@linet-services.de>
Wed, 8 Aug 2012 14:48:47 +0000 (16:48 +0200)
committerSven Schöling <s.schoeling@linet-services.de>
Wed, 8 Aug 2012 14:48:47 +0000 (16:48 +0200)
Die entsprechende Routine hatte einen bösen Fall von Shlemiel the Painter's
algorithm [1]. Dadurch wurden Fileuploads mit mehr als 20k Zeilen extrem
langsam. Binärdaten wie pdfs oder Bilder hat das nicht gestört, aber bei CSV
Imports hat eine 80k Zeilen Datei dann auch mal 2-5min gebraucht, nur um den
Request zu parsen.

Jetzt werden nur die Indizes geparst und hinterher direkt aus dem Request der
substr gezogen. Ausserdem endlich einen Testfall dafür eingebaut.

[1] http://en.wikipedia.org/wiki/Schlemiel_the_Painter%27s_algorithm

SL/Request.pm
t/request/post_multipart.t [new file with mode: 0644]
t/request/post_multipart_1 [new file with mode: 0644]

index 6466287..a8ae4c1 100644 (file)
@@ -53,6 +53,10 @@ sub _input_to_hash {
 sub _parse_multipart_formdata {
   my ($target, $temp_target, $input) = @_;
   my ($name, $filename, $headers_done, $content_type, $boundary_found, $need_cr, $previous, $p_attachment, $encoding, $transfer_encoding);
+  my $data_start = 0;
+
+  # teach substr and length to use good ol' bytes, not 'em fancy characters
+  use bytes;
 
   # We SHOULD honor encodings and transfer-encodings here, but as hard as I
   # looked I couldn't find a reasonably recent webbrowser that makes use of
@@ -63,12 +67,21 @@ sub _parse_multipart_formdata {
   $ENV{'CONTENT_TYPE'} =~ /multipart\/form-data\s*;\s*boundary\s*=\s*(.+)$/;
   my $boundary = '--' . $1;
 
+  my $index = 0;
+  my $line_length;
   foreach my $line (split m/\n/, $input) {
-    last if (($line eq "${boundary}--") || ($line eq "${boundary}--\r"));
+    $line_length = length $line;
+
+    if ($line =~ /^\Q$boundary\E(--)?\r?$/) {
+      my $last_boundary = $1;
+      my $data       =  substr $input, $data_start, $index - $data_start;
+      $data =~ s/\r?\n$//;
 
-    if (($line eq $boundary) || ($line eq "$boundary\r")) {
-      ${ $previous } =~ s|\r?\n$|| if $previous;
-      ${ $previous } =  Encode::decode($encoding, $$previous) if $previous && !$filename && !$transfer_encoding eq 'binary';
+      if ($previous && !$filename && $transfer_encoding && $transfer_encoding ne 'binary') {
+        ${ $previous } = Encode::decode($encoding, $data);
+      } else {
+        ${ $previous } = $data;
+      }
 
       undef $previous;
       undef $filename;
@@ -79,7 +92,7 @@ sub _parse_multipart_formdata {
       $need_cr        = 0;
       $encoding       = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET;
       $transfer_encoding = undef;
-
+      last if $last_boundary;
       next;
     }
 
@@ -90,6 +103,7 @@ sub _parse_multipart_formdata {
 
       if (!$line) {
         $headers_done = 1;
+        $data_start = $index + $line_length + 1;
         next;
       }
 
@@ -159,11 +173,10 @@ sub _parse_multipart_formdata {
 
     next unless $previous;
 
-    ${ $previous } .= "${line}\n";
+  } continue {
+    $index += $line_length + 1;
   }
 
-  ${ $previous } =~ s|\r?\n$|| if $previous;
-
   $::lxdebug->leave_sub(2);
 }
 
diff --git a/t/request/post_multipart.t b/t/request/post_multipart.t
new file mode 100644 (file)
index 0000000..767807b
--- /dev/null
@@ -0,0 +1,107 @@
+use strict;
+use utf8;
+
+use lib 't';
+use lib 'modules/fallback';
+BEGIN {
+  unshift @INC, 'modules/override';
+}
+
+use Support::TestSetup;
+use Test::More tests => 2;
+use Data::Dumper;
+require Test::Deep;
+use Encode;
+
+use SL::Request;
+
+Support::TestSetup::login();
+
+open my $fh, '<', 't/request/post_multipart_1' or die "can't load test";
+my $data = do { $/ = undef; <$fh> };
+
+my $t = {};
+my $tt = {};
+
+local $ENV{CONTENT_TYPE} = 'multipart/form-data; boundary=---------------------------23281168279961';
+SL::Request::_parse_multipart_formdata($t, $tt, $data);
+
+
+my $blob = Encode::encode('utf-8', qq|\x{feff}Stunde;Montag;Dienstag;Mittwoch;Donnerstag;Freitag
+1;Mathe;Deutsch;Englisch;Mathe;Kunst
+2;Sport;Französisch;Geschichte;Sport;Geschichte
+3;Sport;"Religion ev;kath";Kunst;;Kunst|);
+
+my $t_cmp = {
+          'profile' => {
+                       'name' => undef,
+                       'type' => undef
+                     },
+          'quote_char' => undef,
+          'file' => $blob,
+          'custom_sep_char' => undef,
+          'sep_char' => undef,
+          'settings' => {
+                        'article_number_policy' => undef,
+                        'sellprice_places' => undef,
+                        'charset' => undef,
+                        'apply_buchungsgruppe' => undef,
+                        'full_preview' => undef,
+                        'parts_type' => undef,
+                        'default_unit' => undef,
+                        'default_buchungsgruppe' => undef,
+                        'duplicates' => undef,
+                        'numberformat' => undef,
+                        'sellprice_adjustment_type' => undef,
+                        'shoparticle_if_missing' => undef,
+                        'sellprice_adjustment' => undef
+                      },
+          'custom_escape_char' => undef,
+          'action_test' => undef,
+          'custom_quote_char' => undef,
+          'escape_char' => undef,
+          'action' => undef
+        };
+$t_cmp->{ATTACHMENTS}{file}{data} =  \$t_cmp->{'file'};
+
+
+is_deeply $t, $t_cmp;
+
+is_deeply $tt,
+        {
+          'profile' => {
+                       'name' => '',
+                       'type' =>'parts',
+                     },
+          'file' => undef,
+          'quote_char' => 'quote',
+          'custom_sep_char' => '',
+          'sep_char' => 'semicolon',
+          'settings' => {
+                        'article_number_policy' => 'update_prices',
+                        'sellprice_places' => 2,
+                        'charset' => 'UTF-8',
+                        'apply_buchungsgruppe' => 'all',
+                        'full_preview' => '0',
+                        'parts_type' => 'part',
+                        'default_unit' => 'g',
+                        'default_buchungsgruppe' => '815',
+                        'duplicates' => 'no_check',
+                        'numberformat' => '1.000,00',
+                        'sellprice_adjustment_type' => 'percent',
+                        'shoparticle_if_missing' => '0',
+                        'sellprice_adjustment' =>'0'
+                      },
+          'custom_escape_char' => '',
+          'action_test' => 'Test und Vorschau',
+          'ATTACHMENTS' => {
+                           'file' => {
+                                     'filename' => 'from_wikipedia.csv'
+                                   }
+                         },
+          'custom_quote_char' => '',
+          'escape_char' => 'quote',
+          'action' => 'CsvImport/dispatch',
+          'FILENAME' => 'from_wikipedia.csv'
+        };
+
diff --git a/t/request/post_multipart_1 b/t/request/post_multipart_1
new file mode 100644 (file)
index 0000000..c3ced0f
--- /dev/null
@@ -0,0 +1,101 @@
+-----------------------------23281168279961
+Content-Disposition: form-data; name="action"
+
+CsvImport/dispatch
+-----------------------------23281168279961
+Content-Disposition: form-data; name="profile.type"
+
+parts
+-----------------------------23281168279961
+Content-Disposition: form-data; name="profile.name"
+
+
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.numberformat"
+
+1.000,00
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.charset"
+
+UTF-8
+-----------------------------23281168279961
+Content-Disposition: form-data; name="sep_char"
+
+semicolon
+-----------------------------23281168279961
+Content-Disposition: form-data; name="custom_sep_char"
+
+
+-----------------------------23281168279961
+Content-Disposition: form-data; name="quote_char"
+
+quote
+-----------------------------23281168279961
+Content-Disposition: form-data; name="custom_quote_char"
+
+
+-----------------------------23281168279961
+Content-Disposition: form-data; name="escape_char"
+
+quote
+-----------------------------23281168279961
+Content-Disposition: form-data; name="custom_escape_char"
+
+
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.duplicates"
+
+no_check
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.article_number_policy"
+
+update_prices
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.sellprice_places"
+
+2
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.sellprice_adjustment"
+
+0
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.sellprice_adjustment_type"
+
+percent
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.shoparticle_if_missing"
+
+0
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.parts_type"
+
+part
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.default_buchungsgruppe"
+
+815
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.apply_buchungsgruppe"
+
+all
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.default_unit"
+
+g
+-----------------------------23281168279961
+Content-Disposition: form-data; name="settings.full_preview"
+
+0
+-----------------------------23281168279961
+Content-Disposition: form-data; name="file"; filename="from_wikipedia.csv"
+Content-Type: text/comma-separated-values
+
+Stunde;Montag;Dienstag;Mittwoch;Donnerstag;Freitag
+1;Mathe;Deutsch;Englisch;Mathe;Kunst
+2;Sport;Französisch;Geschichte;Sport;Geschichte
+3;Sport;"Religion ev;kath";Kunst;;Kunst
+-----------------------------23281168279961
+Content-Disposition: form-data; name="action_test"
+
+Test und Vorschau
+-----------------------------23281168279961--