From 0532e721f1427c5d3a545975b6d7fe5b4fcc9873 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bernd=20Ble=C3=9Fmann?= Date: Wed, 6 Apr 2022 11:22:57 +0200 Subject: [PATCH] Volltext-Suche: Hintergrund-Job zum Extrahieren von Texten aus Dokumenten --- .../CreateOrUpdateFileFullTexts.pm | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 SL/BackgroundJob/CreateOrUpdateFileFullTexts.pm diff --git a/SL/BackgroundJob/CreateOrUpdateFileFullTexts.pm b/SL/BackgroundJob/CreateOrUpdateFileFullTexts.pm new file mode 100644 index 000000000..fd281162d --- /dev/null +++ b/SL/BackgroundJob/CreateOrUpdateFileFullTexts.pm @@ -0,0 +1,117 @@ +package SL::BackgroundJob::CreateOrUpdateFileFullTexts; + +use strict; + +use parent qw(SL::BackgroundJob::Base); + +use Encode qw(decode); +use English qw( -no_match_vars ); +use File::Slurp qw(read_file); +use List::MoreUtils qw(uniq); +use IPC::Run qw(); +use Unicode::Normalize qw(); + +use SL::DB::File; +use SL::DB::FileFullText; +use SL::HTML::Util; + +my %extractor_by_mime_type = ( + 'application/pdf' => \&_pdf_to_strings, + 'text/html' => \&_html_to_strings, + 'text/plain' => \&_text_to_strings, +); + +sub create_job { + $_[0]->create_standard_job('7 * * * *'); # seven minutes after every hour +} + +# +# If job does not throw an error, +# success in background_job_histories is 'success'. +# It is 'failure' otherwise. +# +# return value goes to result in background_job_histories +# +sub run { + my $self = shift; + my $db_obj = shift; + + my $all_dbfiles = SL::DB::Manager::File->get_all; + + foreach my $dbfile (@$all_dbfiles) { + next if $dbfile->full_text && (($dbfile->mtime || $dbfile->itime) <= ($dbfile->full_text->mtime || $dbfile->full_text->itime)); + next if !defined $extractor_by_mime_type{$dbfile->mime_type}; + + my $file_name; + if (!eval { $file_name = SL::File->get(dbfile => $dbfile)->get_file(); 1; }) { + $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::run: get_file failed: " . $EVAL_ERROR); + next; + } + + my $text = $extractor_by_mime_type{$dbfile->mime_type}->($file_name); + + if ($dbfile->full_text) { + $dbfile->full_text->update_attributes(full_text => $text); + } else { + SL::DB::FileFullText->new(file => $dbfile, full_text => $text)->save; + } + } + + return 'ok'; +} + +sub _pdf_to_strings { + my ($file_name) = @_; + + my @cmd = qw(pdftotext -enc UTF-8); + push @cmd, $file_name; + push @cmd, '-'; + + my ($txt, $err); + + IPC::Run::run \@cmd, \undef, \$txt, \$err; + + if ($CHILD_ERROR) { + $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::_pdf_to_text failed for '$file_name': " . ($CHILD_ERROR >> 8) . ": " . $err); + return ''; + } + + $txt = Encode::decode('utf-8-strict', $txt); + $txt =~ s{\r}{ }g; + $txt =~ s{\p{WSpace}+}{ }g; + $txt = Unicode::Normalize::normalize('C', $txt); + $txt = join ' ' , uniq(split(' ', $txt)); + + return $txt; +} + +sub _html_to_strings { + my ($file_name) = @_; + + my $txt = read_file($file_name); + + $txt = Encode::decode('utf-8-strict', $txt); + $txt = SL::HTML::Util::strip($txt); + $txt =~ s{\r}{ }g; + $txt =~ s{\p{WSpace}+}{ }g; + $txt = Unicode::Normalize::normalize('C', $txt); + $txt = join ' ' , uniq(split(' ', $txt)); + + return $txt; +} + +sub _text_to_strings { + my ($file_name) = @_; + + my $txt = read_file($file_name); + + $txt = Encode::decode('utf-8-strict', $txt); + $txt =~ s{\r}{ }g; + $txt =~ s{\p{WSpace}+}{ }g; + $txt = Unicode::Normalize::normalize('C', $txt); + $txt = join ' ' , uniq(split(' ', $txt)); + + return $txt; +} + +1; -- 2.20.1