1 package SL::Helper::Csv;
9 use Params::Validate qw(:all);
11 use Rose::Object::MakeMethods::Generic scalar => [ qw(
12 file encoding sep_char quote_char escape_char header profile
13 numberformat dateformat ignore_unknown_columns strict_profile is_multiplexed
14 _io _csv _objects _parsed _data _errors all_cvar_configs case_insensitive_header
17 use SL::Helper::Csv::Dispatcher;
18 use SL::Helper::Csv::Error;
24 my %params = validate(@_, {
25 sep_char => { default => ';' },
26 quote_char => { default => '"' },
27 escape_char => { default => '"' },
28 header => { type => ARRAYREF, optional => 1 },
29 profile => { type => ARRAYREF, optional => 1 },
34 ignore_unknown_columns => 0,
36 case_insensitive_header => 0,
38 my $self = bless {}, $class;
40 $self->$_($params{$_}) for keys %params;
42 $self->_io(IO::File->new);
43 $self->_csv(Text::CSV_XS->new({
45 sep_char => $self->sep_char,
46 quote_char => $self->quote_char,
47 escape_char => $self->escape_char,
56 my ($self, %params) = @_;
59 return if ! $self->_check_multiplexed;
60 return if ! $self->_check_header;
61 return if ! $self->dispatcher->parse_profile;
62 return if ! $self->_parse_data;
73 my ($self, %params) = @_;
74 croak 'must parse first' unless $self->_parsed;
76 $self->_make_objects unless $self->_objects;
77 return wantarray ? @{ $self->_objects } : $self->_objects;
91 my ($self, %params) = @_;
93 $self->encoding($self->_guess_encoding) if !$self->encoding;
95 $self->_io->open($self->file, '<' . $self->_encode_layer)
96 or die "could not open file " . $self->file;
101 # check, if data is multiplexed and if all nessesary infos are given
102 sub _check_multiplexed {
103 my ($self, %params) = @_;
105 $self->is_multiplexed(0);
107 # If more than one profile is given, it is multiplexed.
108 if ($self->profile) {
109 my @profile = @{ $self->profile };
110 if (scalar @profile > 1) {
112 # Each profile needs a class and a row_ident
113 foreach my $p (@profile) {
114 if ( !defined $p->{class} || !defined $p->{row_ident} ) {
120 # If header is given, there need to be a header for each profile
121 if ($info_ok && $self->header) {
122 my @header = @{ $self->header };
123 if (scalar @profile != scalar @header) {
127 $self->is_multiplexed($info_ok);
132 # ok, if not multiplexed
137 my ($self, %params) = @_;
140 $header = $self->header;
142 my $n_header = ($self->is_multiplexed)? scalar @{ $self->profile } : 1;
143 foreach my $p_num (0..$n_header - 1) {
144 my $h = $self->_csv->getline($self->_io);
147 $self->_csv->error_input,
148 $self->_csv->error_diag,
152 push @{ $header }, $h;
156 # Special case: utf8 BOM.
157 # certain software (namely MS Office and notepad.exe insist on prefixing
158 # data with a discouraged but valid byte order mark
159 # if not removed, the first header field will not be recognized
161 foreach my $h (@{ $header }) {
162 if ($h && $h->[0] && $self->encoding =~ /utf-?8/i) {
163 $h->[0] =~ s/^\x{FEFF}//;
168 # check, if all header fields are parsed well
171 foreach my $h (@{ $header }) {
180 return unless $all_ok;
182 # Special case: human stupidity
183 # people insist that case sensitivity doesn't exist and try to enter all
184 # sorts of stuff. at this point we've got a profile (with keys that represent
185 # valid methods), and a header full of strings. if two of them match, the user
186 # mopst likely meant that field, so rewrite the header
187 if ($self->case_insensitive_header) {
188 die 'case_insensitive_header is only possible with profile' unless $self->profile;
190 keys %{ $self->profile || {} },
192 for my $name (@names) {
193 for my $i (0..$#$header) {
194 $header->[$i] = $name if lc $header->[$i] eq lc $name;
199 return $self->header($header);
203 my ($self, %params) = @_;
207 my $row = $self->_csv->getline($self->_io);
209 my $header = $self->_header_by_row($row);
210 $self->_csv->column_names(@{ $header });
212 @hr{@{ $header }} = @$row;
215 last if $self->_csv->eof;
216 # Text::CSV_XS 0.89 added record number to error_diag
217 if (qv(Text::CSV_XS->VERSION) >= qv('0.89')) {
219 $self->_csv->error_input,
220 $self->_csv->error_diag,
224 $self->_csv->error_input,
225 $self->_csv->error_diag,
226 $self->_io->input_line_number,
230 last if $self->_csv->eof;
233 $self->_data(\@data);
234 $self->_push_error(@errors);
240 my ($self, $row) = @_;
242 my @header = @{ $self->header };
243 if ($self->is_multiplexed) {
245 foreach my $profile (@{ $self->profile }) {
246 if (@{ $row }[0] eq $profile->{row_ident}) {
257 ':encoding(' . $_[0]->encoding . ')';
261 my ($self, %params) = @_;
264 local $::myconfig{numberformat} = $self->numberformat if $self->numberformat;
265 local $::myconfig{dateformat} = $self->dateformat if $self->dateformat;
267 for my $line (@{ $self->_data }) {
268 my $tmp_obj = $self->dispatcher->dispatch($line);
269 push @objs, $tmp_obj;
272 $self->_objects(\@objs);
276 my ($self, %params) = @_;
278 $self->{_dispatcher} ||= $self->_make_dispatcher;
281 sub _make_dispatcher {
282 my ($self, %params) = @_;
284 die 'need a header to make a dispatcher' unless $self->header;
286 return SL::Helper::Csv::Dispatcher->new($self);
289 sub _guess_encoding {
295 my ($self, @errors) = @_;
296 my @new_errors = ($self->errors, map { SL::Helper::Csv::Error->new(@$_) } @errors);
297 $self->_errors(\@new_errors);
309 SL::Helper::Csv - take care of csv file uploads
315 my $csv = SL::Helper::Csv->new(
316 file => \$::form->{upload_file},
317 encoding => 'utf-8', # undef means utf8
318 sep_char => ',', # default ';'
319 quote_char => '\'', # default '"'
320 escape_char => '"', # default '"'
321 header => [ [qw(id text sellprice word)] ], # see later
322 profile => [ { profile => { sellprice => 'sellprice_as_number'},
323 class => 'SL::DB::Part' } ],
326 my $status = $csv->parse;
327 my $hrefs = $csv->get_data;
328 my @objects = $csv->get_objects;
330 my @errors = $csv->errors;
336 Text::CSV offeres already good functions to get lines out of a csv file, but in
337 most cases you will want those line to be parsed into hashes or even objects,
338 so this model just skips ahead and gives you objects.
340 Its basic assumptions are:
344 =item You do know what you expect to be in that csv file.
346 This means first and foremost you have knowledge about encoding, number and
347 date format, csv parameters such as quoting and separation characters. You also
348 know what content will be in that csv and what L<Rose::DB> is responsible for
349 it. You provide valid header columns and their mapping to the objects.
351 =item You do NOT know if the csv provider yields to your expectations.
353 Stuff that does not work with what you expect should not crash anything, but
354 give you a hint what went wrong. As a result, if you remember to check for
355 errors after each step, you should be fine.
357 =item Data does not make sense. It's just data.
359 Almost all data imports have some type of constraints. Some data needs to be
360 unique, other data needs to be connected to existing data sets. This will not
361 happen here. You will receive a plain mapping of the data into the class tree,
366 This module can handle multiplexed data of different class types. In that case
367 multiple profiles with classes and row identifiers must be given. Multiple
368 headers may also be given or read from csv data. Data must contain the row
369 identifier in the first column and it's field name must be 'datatype'.
379 Standard constructor. You can use this to set most of the data.
383 Do the actual work. Will return true ($self actually) if success, undef if not.
387 Parse the data into objects and return those.
389 This method will return list or arrayref depending on context.
393 Returns an arrayref of the raw lines as hashrefs.
397 Return all errors that came up during parsing. See error handling for detailed
408 The file which contents are to be read. Can be a name of a physical file or a
409 scalar ref for memory data.
413 Encoding of the CSV file. Note that this module does not do any encoding
414 guessing. Know what your data is. Defaults to utf-8.
422 Same as in L<Text::CSV>
424 =item C<header> \@HEADERS
426 If given, it contains an ARRAYREF for each different class type (i.e. one
427 ARRAYREF if the data is only of one class type). These ARRAYREFS are the header
428 fields which are an array of columns. In this case the first lines are not used
429 as a header. Empty header fields will be ignored in objects.
431 If not given, headers are taken from the first n lines of data, where n is the
432 number of different class types.
436 classic data of one type:
437 [ [ 'name', 'street', 'zipcode', 'city' ] ]
439 multiplexed data with two different types
440 [ [ 'ordernumber', 'customer', 'transdate' ], [ 'partnumber', 'qty', 'sellprice' ] ]
442 =item C<profile> [{profile => \%ACCESSORS, class => class, row_ident => ri},]
444 This is an ARRAYREF to HASHREFs which may contain the keys C<profile>, C<class>
447 The C<profile> is a HASHREF which may be used to map header fields to custom
450 [ {profile => { listprice => listprice_as_number }} ]
452 In this case C<listprice_as_number> will be used to read in values from the
455 In case of a One-To-One relationsship these can also be set over
456 relationsships by sparating the steps with a dot (C<.>). This will work:
458 [ {profile => { customer => 'customer.name' }} ]
460 And will result in something like this:
462 $obj->customer($obj->meta->relationship('customer')->class->new);
463 $obj->customer->name($csv_line->{customer})
465 But beware, this will not try to look up anything in the database. You will
466 simply receive objects that represent what the profile defined. If some of
467 these information are unique, and should be connected to preexisting data, you
468 will have to do that for yourself. Since you provided the profile, it is
469 assumed you know what to do in this case.
471 If C<class> is present, the line will be handed to the new sub of this class,
472 and the return value used instead of the line itself.
474 C<row_ident> is a string to recognize the right profile and class for each data
475 line in multiplexed data.
477 In case of multiplexed data, C<class> and C<row_ident> must be given.
480 class => 'SL::DB::Order',
484 class => 'SL::DB::OrderItem',
486 profile => {sellprice => sellprice_as_number}
489 =item C<ignore_unknown_columns>
491 If set, the import will ignore unkown header columns. Useful for lazy imports,
492 but deactivated by default.
494 =item C<case_insensitive_header>
496 If set, header columns will be matched against profile entries case
497 insensitive, and on match the profile name will be taken.
499 Only works if a profile is given, will die otherwise.
501 If both C<case_insensitive_header> and C<strict_profile> is set, matched header
502 columns will be accepted.
504 =item C<strict_profile>
506 If set, all columns to be parsed must be specified in C<profile>. Every header
507 field not listed there will be treated like an unknown column.
509 If both C<case_insensitive_header> and C<strict_profile> is set, matched header
510 columns will be accepted.
514 =head1 ERROR HANDLING
516 After parsing a file all errors will be accumulated into C<errors>.
517 Each entry is an object with the following attributes:
519 raw_input: offending raw input,
520 code: Text::CSV error code if Text:CSV signalled an error, 0 else,
521 diag: error diagnostics,
522 line: position in line,
523 col: estimated line in file,
525 Note that the last entry can be off, but will give an estimate.
533 sep_char, quote_char, and escape_char are passed to Text::CSV on creation.
534 Changing them later has no effect currently.
538 Encoding errors are not dealt with properly.
544 Dispatch to child objects, like this:
546 $csv = SL::Helper::Csv->new(
559 class => SL::DB::Part,
565 Sven Schöling E<lt>s.schoeling@linet-services.deE<gt>