9 use List::Util qw(first);
# Result codes stored under the `result` key of the structure built by
# extract_from_pdf. RES_OK (0) signals success; each failure mode has its own
# positive value so callers can compare against RES_OK or a specific error.
12 use constant RES_OK => 0;
13 use constant RES_ERR_FILE_OPEN => 1;
14 use constant RES_ERR_NO_XMP_METADATA => 2;
15 use constant RES_ERR_NO_XML_INVOICE => 3;
16 use constant RES_ERR_NOT_ZUGFERD => 4;
17 use constant RES_ERR_UNSUPPORTED_ZUGFERD_VERSION => 5;
# Searches the files embedded in a PDF for the ZUGFeRD invoice XML.
#
# Works on $doc, the PDF document object (extract_from_pdf passes in the
# object created by CAM::PDF->new). Returns the decoded text of the first
# embedded file that parses as XML and whose root element is named
# 'rsm:CrossIndustryInvoice'.
#
# NOTE(review): this view of the sub is missing lines (argument unpacking,
# loop headers, closing braces, the final return), so the comments below only
# describe the statements that are visible.
19 sub _extract_zugferd_invoice_xml {
# NOTE(review): both early exits return an empty hash ref, which is a defined
# value, whereas extract_from_pdf detects "not found" with
# `!defined $invoice_xml`. Confirm that a PDF without Names/EmbeddedFiles is
# really meant to bypass that check.
21 my $names_dict = $doc->getValue($doc->getRootDict->{Names}) or return {};
22 my $files_tree = $names_dict->{EmbeddedFiles} or return {};
# Work list of tree nodes still to visit, seeded with the EmbeddedFiles root.
23 my @agenda = $files_tree;
26 # Hardly ever more than single leaf, but...
# Take the next pending node off the work list and resolve it.
29 my $item = $doc->getValue(shift @agenda);
# Node with a Kids entry: resolve the child list (presumably queued on
# @agenda by a line not shown here -- TODO confirm).
32 my $kids = $doc->getValue($item->{Kids});
# Node with a Names entry: resolve the array and each of its elements.
36 my $nodes = $doc->getValue($item->{Names});
37 my @names = map { $doc->getValue($_)} @$nodes;
# The resolved list is consumed two elements at a time; only the second
# element ($v) is used by the visible code.
40 my ($k, $v) = splice @names, 0, 2;
41 my $ef_node = $v->{EF};
42 my $ef_dict = $doc->getValue($ef_node);
# Pick one entry of the EF dictionary; hash order is unspecified, so this is
# an arbitrary entry whenever the dictionary holds more than one.
43 my $fnode = (values %$ef_dict)[0];
44 my $any_num = $fnode->{value};
45 my $obj_node = $doc->dereference($any_num);
# Fall back to an empty string so the regex match below never sees undef.
46 my $content = $doc->decodeOne($obj_node->{value}, 0) // '';
# Cheap textual pre-filter: skip files that cannot be an invoice before
# paying for a full XML parse.
50 next if $content !~ m{<rsm:CrossIndustryInvoice};
# eval swallows parser exceptions; $dom is then undef and the file is ignored.
53 my $dom = eval { XML::LibXML->load_xml(string => $content) };
54 return $content if $dom && ($dom->documentElement->nodeName eq 'rsm:CrossIndustryInvoice');
# Fetches the XMP metadata referenced by the Metadata entry of the PDF's root
# dictionary. Returns the `value` of that entry's StreamData when the node,
# its StreamData and the value are all present.
# NOTE(review): the fallback for a missing stream lies outside this excerpt;
# extract_from_pdf treats the result as XML text for XML::LibXML.
62 sub _get_xmp_metadata {
65 my $node = $doc->getValue($doc->getRootDict->{Metadata});
# Guard every level of the lookup before reading the stream's value.
66 if ($node && $node->{StreamData} && defined($node->{StreamData}->{value})) {
67 return $node->{StreamData}->{value};
# Opens the PDF $file_name and tries to extract ZUGFeRD invoice data from it.
#
# Parameters:
#   $self      - invocant (the POD calls this as a class method).
#   $file_name - path of the PDF, handed to CAM::PDF->new.
#
# The visible code builds result structures keyed by `result` (one of the
# RES_* constants) and `message` for failures, and `metadata_xmp` plus
# `invoice_xml` for the successful case.
#
# NOTE(review): this view of the sub is missing lines (condition headers,
# returns, closing braces); comments describe the visible statements only.
73 sub extract_from_pdf {
74 my ($self, $file_name) = @_;
76 my $pdf_doc = CAM::PDF->new($file_name);
# Failure case: the PDF could not be opened.
80 result => RES_ERR_FILE_OPEN(),
81 message => $::locale->text('The file \'#1\' could not be opened for reading.', $file_name),
85 my $xmp = _get_xmp_metadata($pdf_doc);
# Failure case: the PDF carries no XMP metadata.
88 result => RES_ERR_NO_XMP_METADATA(),
89 message => $::locale->text('The file \'#1\' does not contain the required XMP meta data.', $file_name),
# Failure case: XMP is present but unparsable; reuses the same result code
# with a different message (presumably the `$bad` value returned below).
94 result => RES_ERR_NO_XMP_METADATA(),
95 message => $::locale->text('Parsing the XMP metadata failed.'),
# eval swallows parser exceptions, leaving $dom undefined on failure.
98 my $dom = eval { XML::LibXML->load_xml(string => $xmp) };
100 return $bad if !$dom;
102 my $xpc = XML::LibXML::XPathContext->new($dom);
103 $xpc->registerNs('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#');
# Inspect every rdf:Description element and look at the first namespace
# declaration among its attributes.
# NOTE(review): the 'x' prefix used in the XPath must be registered too; that
# registration is not visible in this excerpt -- confirm.
107 foreach my $node ($xpc->findnodes('/x:xmpmeta/rdf:RDF/rdf:Description')) {
# NOTE(review): `first` yields undef when no attribute is a namespace node;
# the `$ns->getData` calls below would then die unless a guard exists on a
# line not shown here -- confirm.
108 my $ns = first { ref($_) eq 'XML::LibXML::Namespace' } $node->attributes;
# The exact ZUGFeRD 2.0 namespace URN marks a supported document.
111 if ($ns->getData =~ m{urn:zugferd:pdfa:CrossIndustryDocument:invoice:2p0}) {
112 $zugferd_version = '2p0';
# Any other namespace mentioning "zugferd" (case-insensitive) is recorded as
# an unsupported version.
116 if ($ns->getData =~ m{zugferd}i) {
117 $zugferd_version = 'unsupported';
# Failure case: no ZUGFeRD namespace was seen at all.
122 if (!$zugferd_version) {
124 result => RES_ERR_NOT_ZUGFERD(),
125 message => $::locale->text('The XMP metadata does not declare the ZUGFeRD data.'),
# Failure case: only versions starting with "2p" pass, so the 'unsupported'
# marker set above ends up here.
129 if ($zugferd_version !~ m{^2p}) {
131 result => RES_ERR_UNSUPPORTED_ZUGFERD_VERSION(),
132 message => $::locale->text('The ZUGFeRD version used is not supported.'),
136 my $invoice_xml = _extract_zugferd_invoice_xml($pdf_doc);
# Failure case: no embedded invoice XML. Only an undefined value triggers it.
138 if (!defined $invoice_xml) {
140 result => RES_ERR_NO_XML_INVOICE(),
141 message => $::locale->text('The ZUGFeRD XML invoice was not found.'),
# Success case: hand back both the raw XMP text and the invoice XML text.
147 metadata_xmp => $xmp,
148 invoice_xml => $invoice_xml,
162 SL::ZUGFeRD - Helper functions for dealing with PDFs containing ZUGFeRD invoice data
166 my $pdf = '/path/to/my.pdf';
167 my $info = SL::ZUGFeRD->extract_from_pdf($pdf);
169 if ($info->{result} != SL::ZUGFeRD::RES_OK()) {
170 # An error occurred; log message from parser:
171 $::lxdebug->message(LXDebug::DEBUG1(), "Could not extract ZUGFeRD data from $pdf: " . $info->{message});
175 # Parse & handle invoice XML:
176 my $dom = XML::LibXML->load_xml(string => $info->{invoice_xml});
183 =item C<extract_from_pdf> C<$file_name>
185 Opens an existing PDF in the file system and tries to extract ZUGFeRD
186 invoice data from it. First it'll parse the XMP metadata and look for
187 the ZUGFeRD declaration inside. If the declaration isn't found or the
188 declared version isn't 2p0, an error is returned.
190 Otherwise it'll continue to look through all embedded files in the
191 PDF. The first embedded XML file with a root node of
192 C<rsm:CrossIndustryInvoice> will be returned.
194 Always returns a hash ref containing the key C<result>, a number that
195 can be one of the following constants:
199 =item C<RES_OK> (0): parsing was OK; the returned hash will also
200 contain the keys C<metadata_xmp> and C<invoice_xml> which will contain
201 the XML text of the metadata & the ZUGFeRD invoice.
203 =item C<RES_ERR_…> (all values E<gt> 0): parsing failed; the hash will
204 also contain a key C<message> which contains human-readable
205 information about what exactly failed.
217 Moritz Bunkus E<lt>m.bunkus@linet-services.deE<gt>