From a827a37dbc9fda429719d199613a930200820ee6 Mon Sep 17 00:00:00 2001 From: Moritz Bunkus Date: Mon, 2 Mar 2020 15:53:09 +0100 Subject: [PATCH] SL::ZUGFeRD: Funktionen zum Extrahieren von ZUGFeRD-XML-Daten aus PDFs --- SL/InstallationCheck.pm | 2 + SL/ZUGFeRD.pm | 219 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 SL/ZUGFeRD.pm diff --git a/SL/InstallationCheck.pm b/SL/InstallationCheck.pm index a54a48877..b4596e7a2 100644 --- a/SL/InstallationCheck.pm +++ b/SL/InstallationCheck.pm @@ -19,6 +19,7 @@ BEGIN { { name => "parent", url => "http://search.cpan.org/~corion/", debian => 'libparent-perl' }, { name => "Algorithm::CheckDigits", url => "http://search.cpan.org/~mamawe/", debian => 'libalgorithm-checkdigits-perl' }, { name => "Archive::Zip", version => '1.16', url => "http://search.cpan.org/~phred/", debian => 'libarchive-zip-perl' }, + { name => "CAM::PDF", url => "https://metacpan.org/pod/CAM::PDF", debian => 'libcap-pdf-perl' }, { name => "CGI", version => '3.43', url => "http://search.cpan.org/~leejo/", debian => 'libcgi-pm-perl' }, # 4.09 is not core anymore (perl 5.20) { name => "Clone", url => "http://search.cpan.org/~rdf/", debian => 'libclone-perl' }, { name => "Config::Std", url => "http://search.cpan.org/~dconway/", debian => 'libconfig-std-perl' }, @@ -62,6 +63,7 @@ BEGIN { { name => "Text::Iconv", version => '1.2', url => "http://search.cpan.org/~mpiotr/", debian => 'libtext-iconv-perl' }, { name => "Text::Unidecode", url => "http://search.cpan.org/~sburke/", debian => 'libtext-unidecode-perl' }, { name => "URI", version => '1.35', url => "http://search.cpan.org/~gaas/", debian => 'liburi-perl' }, + { name => "XML::LibXML", url => "https://metacpan.org/pod/XML::LibXML", debian => 'libxml-libxml-perl' }, { name => "XML::Writer", version => '0.602', url => "http://search.cpan.org/~josephw/", debian => 'libxml-writer-perl' }, { name => "YAML", version => '0.62', url => "http://search.cpan.org/~ingy/", debian => 'libyaml-perl' }, ); diff --git a/SL/ZUGFeRD.pm b/SL/ZUGFeRD.pm new file mode 100644 index 000000000..d4d263565 --- /dev/null +++ b/SL/ZUGFeRD.pm @@ -0,0 +1,219 @@ +package SL::ZUGFeRD; + +use strict; +use warnings; +use utf8; + +use CAM::PDF; +use Data::Dumper; +use List::Util qw(first); +use XML::LibXML; + +use constant RES_OK => 0; +use constant RES_ERR_FILE_OPEN => 1; +use constant RES_ERR_NO_XMP_METADATA => 2; +use constant RES_ERR_NO_XML_INVOICE => 3; +use constant RES_ERR_NOT_ZUGFERD => 4; +use constant RES_ERR_UNSUPPORTED_ZUGFERD_VERSION => 5; + +sub _extract_zugferd_invoice_xml { + my $doc = shift; + my $names_dict = $doc->getValue($doc->getRootDict->{Names}) or return {}; + my $files_tree = $names_dict->{EmbeddedFiles} or return {}; + my @agenda = $files_tree; + my $ret = {}; + + # Hardly ever more than single leaf, but... + + while (@agenda) { + my $item = $doc->getValue(shift @agenda); + + if ($item->{Kids}) { + my $kids = $doc->getValue($item->{Kids}); + push @agenda, @$kids + + } else { + my $nodes = $doc->getValue($item->{Names}); + my @names = map { $doc->getValue($_)} @$nodes; + + while (@names) { + my ($k, $v) = splice @names, 0, 2; + my $ef_node = $v->{EF}; + my $ef_dict = $doc->getValue($ef_node); + my $fnode = (values %$ef_dict)[0]; + my $any_num = $fnode->{value}; + my $obj_node = $doc->dereference($any_num); + my $content = $doc->decodeOne($obj_node->{value}, 0) // ''; + + print "1\n"; + + next if $content !~ m{load_xml(string => $content) }; + return $content if $dom && ($dom->documentElement->nodeName eq 'rsm:CrossIndustryInvoice'); + } + } + } + + return undef; +} + +sub _get_xmp_metadata { + my ($doc) = @_; + + my $node = $doc->getValue($doc->getRootDict->{Metadata}); + if ($node && $node->{StreamData} && defined($node->{StreamData}->{value})) { + return $node->{StreamData}->{value}; + } + + return undef; +} + +sub extract_from_pdf { + my ($self, $file_name) = @_; + + my $pdf_doc = CAM::PDF->new($file_name); + + if (!$pdf_doc) { + return { + result => RES_ERR_FILE_OPEN(), + message => $::locale->text('The file \'#1\' could not be opened for reading.', $file_name), + }; + } + + my $xmp = _get_xmp_metadata($pdf_doc); + if (!defined $xmp) { + return { + result => RES_ERR_NO_XMP_METADATA(), + message => $::locale->text('The file \'#1\' does not contain the required XMP meta data.', $file_name), + }; + } + + my $bad = { + result => RES_ERR_NO_XMP_METADATA(), + message => $::locale->text('Parsing the XMP metadata failed.'), + }; + + my $dom = eval { XML::LibXML->load_xml(string => $xmp) }; + + return $bad if !$dom; + + my $xpc = XML::LibXML::XPathContext->new($dom); + $xpc->registerNs('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'); + + my $zugferd_version; + + foreach my $node ($xpc->findnodes('/x:xmpmeta/rdf:RDF/rdf:Description')) { + my $ns = first { ref($_) eq 'XML::LibXML::Namespace' } $node->attributes; + next unless $ns; + + if ($ns->getData =~ m{urn:zugferd:pdfa:CrossIndustryDocument:invoice:2p0}) { + $zugferd_version = '2p0'; + last; + } + + if ($ns->getData =~ m{zugferd}i) { + $zugferd_version = 'unsupported'; + last; + } + } + + if (!$zugferd_version) { + return { + result => RES_ERR_NOT_ZUGFERD(), + message => $::locale->text('The XMP metadata does not declare the ZUGFeRD data.'), + }; + } + + if ($zugferd_version !~ m{^2p}) { + return { + result => RES_ERR_UNSUPPORTED_ZUGFERD_VERSION(), + message => $::locale->text('The ZUGFeRD version used is not supported.'), + }; + } + + my $invoice_xml = _extract_zugferd_invoice_xml($pdf_doc); + + if (!defined $invoice_xml) { + return { + result => RES_ERR_NO_XML_INVOICE(), + message => $::locale->text('The ZUGFeRD XML invoice was not found.'), + }; + } + + return { + result => RES_OK(), + metadata_xmp => $xmp, + invoice_xml => $invoice_xml, + }; +} + +1; + +__END__ + +=pod + +=encoding utf8 + +=head1 NAME + +SL::ZUGFeRD - Helper functions for dealing with PDFs containing ZUGFeRD invoice data + +=head1 SYNOPSIS + + my $pdf = '/path/to/my.pdf'; + my $info = SL::ZUGFeRD->extract_from_pdf($pdf); + + if ($info->{result} != SL::ZUGFeRD::RES_OK()) { + # An error occurred; log message from parser: + $::lxdebug->message(LXDebug::DEBUG1(), "Could not extract ZUGFeRD data from $pdf: " . $info->{message}); + return; + } + + # Parse & handle invoice XML: + my $dom = XML::LibXML->load_xml(string => $info->{invoice_xml}); + + +=head1 FUNCTIONS + +=over 4 + +=item C C<$file_name> + +Opens an existing PDF in the file system and tries to extract ZUGFeRD +invoice data from it. First it'll parse the XMP metadata and look for +the ZUGFeRD declaration inside. If the declaration isn't found or the +declared version isn't 2p0, an error is returned. + +Otherwise it'll continue to look through all embedded files in the +PDF. The first embedded XML file with a root node of +C will be returnd. + +Always returns a hash ref containing the key C, a number that +can be one of the following constants: + +=over 4 + +=item C (0): parsing was OK; the returned hash will also +contain the keys C and C which will contain +the XML text of the metadata & the ZUGFeRD invoice. + +=item C (all values E 0): parsing failed; the hash will +also contain a key C which contains a human-readable +information about what exactly failed. + +=back + +=back + +=head1 BUGS + +Nothing here yet. + +=head1 AUTHOR + +Moritz Bunkus Em.bunkus@linet-services.deE + +=cut -- 2.20.1