From dc3f6120f9bbacaa028e554d7fa71e481d4497b4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Sven=20Sch=C3=B6ling?= Date: Wed, 8 Aug 2012 16:48:47 +0200 Subject: [PATCH] Parsing von multipart/formdata beschleuningt. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Die entsprechende Routine hatte einen bösen Fall von Shlemiel the Painter's algorithm [1]. Dadurch wurden Fileuploads mit mehr als 20k Zeilen extrem langsam. Binärdaten wie pdfs oder Bilder hat das nicht gestört, aber bei CSV Imports hat eine 80k Zeilen Datei dann auch mal 2-5min gebraucht, nur um den Request zu parsen. Jetzt werden nur die Indizes geparst und hinterher direkt aus dem Request der substr gezogen. Ausserdem endlich einen Testfall dafür eingebaut. [1] http://en.wikipedia.org/wiki/Schlemiel_the_Painter%27s_algorithm --- SL/Request.pm | 29 +++++++--- t/request/post_multipart.t | 107 +++++++++++++++++++++++++++++++++++++ t/request/post_multipart_1 | 101 ++++++++++++++++++++++++++++++++++ 3 files changed, 229 insertions(+), 8 deletions(-) create mode 100644 t/request/post_multipart.t create mode 100644 t/request/post_multipart_1 diff --git a/SL/Request.pm b/SL/Request.pm index 64662875d..a8ae4c137 100644 --- a/SL/Request.pm +++ b/SL/Request.pm @@ -53,6 +53,10 @@ sub _input_to_hash { sub _parse_multipart_formdata { my ($target, $temp_target, $input) = @_; my ($name, $filename, $headers_done, $content_type, $boundary_found, $need_cr, $previous, $p_attachment, $encoding, $transfer_encoding); + my $data_start = 0; + + # teach substr and length to use good ol' bytes, not 'em fancy characters + use bytes; # We SHOULD honor encodings and transfer-encodings here, but as hard as I # looked I couldn't find a reasonably recent webbrowser that makes use of @@ -63,12 +67,21 @@ sub _parse_multipart_formdata { $ENV{'CONTENT_TYPE'} =~ /multipart\/form-data\s*;\s*boundary\s*=\s*(.+)$/; my $boundary = '--' . $1; + my $index = 0; + my $line_length; foreach my $line (split m/\n/, $input) { - last if (($line eq "${boundary}--") || ($line eq "${boundary}--\r")); + $line_length = length $line; + + if ($line =~ /^\Q$boundary\E(--)?\r?$/) { + my $last_boundary = $1; + my $data = substr $input, $data_start, $index - $data_start; + $data =~ s/\r?\n$//; - if (($line eq $boundary) || ($line eq "$boundary\r")) { - ${ $previous } =~ s|\r?\n$|| if $previous; - ${ $previous } = Encode::decode($encoding, $$previous) if $previous && !$filename && !$transfer_encoding eq 'binary'; + if ($previous && !$filename && $transfer_encoding && $transfer_encoding ne 'binary') { + ${ $previous } = Encode::decode($encoding, $data); + } else { + ${ $previous } = $data; + } undef $previous; undef $filename; @@ -79,7 +92,7 @@ sub _parse_multipart_formdata { $need_cr = 0; $encoding = $::lx_office_conf{system}->{dbcharset} || Common::DEFAULT_CHARSET; $transfer_encoding = undef; - + last if $last_boundary; next; } @@ -90,6 +103,7 @@ sub _parse_multipart_formdata { if (!$line) { $headers_done = 1; + $data_start = $index + $line_length + 1; next; } @@ -159,11 +173,10 @@ sub _parse_multipart_formdata { next unless $previous; - ${ $previous } .= "${line}\n"; + } continue { + $index += $line_length + 1; } - ${ $previous } =~ s|\r?\n$|| if $previous; - $::lxdebug->leave_sub(2); } diff --git a/t/request/post_multipart.t b/t/request/post_multipart.t new file mode 100644 index 000000000..767807b90 --- /dev/null +++ b/t/request/post_multipart.t @@ -0,0 +1,107 @@ +use strict; +use utf8; + +use lib 't'; +use lib 'modules/fallback'; +BEGIN { + unshift @INC, 'modules/override'; +} + +use Support::TestSetup; +use Test::More tests => 2; +use Data::Dumper; +require Test::Deep; +use Encode; + +use SL::Request; + +Support::TestSetup::login(); + +open my $fh, '<', 't/request/post_multipart_1' or die "can't load test"; +my $data = do { $/ = undef; <$fh> }; + +my $t = {}; +my $tt = {}; + +local $ENV{CONTENT_TYPE} = 'multipart/form-data; boundary=---------------------------23281168279961'; +SL::Request::_parse_multipart_formdata($t, $tt, $data); + + +my $blob = Encode::encode('utf-8', qq|\x{feff}Stunde;Montag;Dienstag;Mittwoch;Donnerstag;Freitag +1;Mathe;Deutsch;Englisch;Mathe;Kunst +2;Sport;Französisch;Geschichte;Sport;Geschichte +3;Sport;"Religion ev;kath";Kunst;;Kunst|); + +my $t_cmp = { + 'profile' => { + 'name' => undef, + 'type' => undef + }, + 'quote_char' => undef, + 'file' => $blob, + 'custom_sep_char' => undef, + 'sep_char' => undef, + 'settings' => { + 'article_number_policy' => undef, + 'sellprice_places' => undef, + 'charset' => undef, + 'apply_buchungsgruppe' => undef, + 'full_preview' => undef, + 'parts_type' => undef, + 'default_unit' => undef, + 'default_buchungsgruppe' => undef, + 'duplicates' => undef, + 'numberformat' => undef, + 'sellprice_adjustment_type' => undef, + 'shoparticle_if_missing' => undef, + 'sellprice_adjustment' => undef + }, + 'custom_escape_char' => undef, + 'action_test' => undef, + 'custom_quote_char' => undef, + 'escape_char' => undef, + 'action' => undef + }; +$t_cmp->{ATTACHMENTS}{file}{data} = \$t_cmp->{'file'}; + + +is_deeply $t, $t_cmp; + +is_deeply $tt, + { + 'profile' => { + 'name' => '', + 'type' =>'parts', + }, + 'file' => undef, + 'quote_char' => 'quote', + 'custom_sep_char' => '', + 'sep_char' => 'semicolon', + 'settings' => { + 'article_number_policy' => 'update_prices', + 'sellprice_places' => 2, + 'charset' => 'UTF-8', + 'apply_buchungsgruppe' => 'all', + 'full_preview' => '0', + 'parts_type' => 'part', + 'default_unit' => 'g', + 'default_buchungsgruppe' => '815', + 'duplicates' => 'no_check', + 'numberformat' => '1.000,00', + 'sellprice_adjustment_type' => 'percent', + 'shoparticle_if_missing' => '0', + 'sellprice_adjustment' =>'0' + }, + 'custom_escape_char' => '', + 'action_test' => 'Test und Vorschau', + 'ATTACHMENTS' => { + 'file' => { + 'filename' => 'from_wikipedia.csv' + } + }, + 'custom_quote_char' => '', + 'escape_char' => 'quote', + 'action' => 'CsvImport/dispatch', + 'FILENAME' => 'from_wikipedia.csv' + }; + diff --git a/t/request/post_multipart_1 b/t/request/post_multipart_1 new file mode 100644 index 000000000..c3ced0fc8 --- /dev/null +++ b/t/request/post_multipart_1 @@ -0,0 +1,101 @@ +-----------------------------23281168279961 +Content-Disposition: form-data; name="action" + +CsvImport/dispatch +-----------------------------23281168279961 +Content-Disposition: form-data; name="profile.type" + +parts +-----------------------------23281168279961 +Content-Disposition: form-data; name="profile.name" + + +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.numberformat" + +1.000,00 +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.charset" + +UTF-8 +-----------------------------23281168279961 +Content-Disposition: form-data; name="sep_char" + +semicolon +-----------------------------23281168279961 +Content-Disposition: form-data; name="custom_sep_char" + + +-----------------------------23281168279961 +Content-Disposition: form-data; name="quote_char" + +quote +-----------------------------23281168279961 +Content-Disposition: form-data; name="custom_quote_char" + + +-----------------------------23281168279961 +Content-Disposition: form-data; name="escape_char" + +quote +-----------------------------23281168279961 +Content-Disposition: form-data; name="custom_escape_char" + + +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.duplicates" + +no_check +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.article_number_policy" + +update_prices +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.sellprice_places" + +2 +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.sellprice_adjustment" + +0 +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.sellprice_adjustment_type" + +percent +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.shoparticle_if_missing" + +0 +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.parts_type" + +part +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.default_buchungsgruppe" + +815 +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.apply_buchungsgruppe" + +all +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.default_unit" + +g +-----------------------------23281168279961 +Content-Disposition: form-data; name="settings.full_preview" + +0 +-----------------------------23281168279961 +Content-Disposition: form-data; name="file"; filename="from_wikipedia.csv" +Content-Type: text/comma-separated-values + +Stunde;Montag;Dienstag;Mittwoch;Donnerstag;Freitag +1;Mathe;Deutsch;Englisch;Mathe;Kunst +2;Sport;Französisch;Geschichte;Sport;Geschichte +3;Sport;"Religion ev;kath";Kunst;;Kunst +-----------------------------23281168279961 +Content-Disposition: form-data; name="action_test" + +Test und Vorschau +-----------------------------23281168279961-- -- 2.20.1