1 package SL::BackgroundJob::CreateOrUpdateFileFullTexts;
5 use parent qw(SL::BackgroundJob::Base);
8 use English qw( -no_match_vars );
9 use File::Slurp qw(read_file);
10 use List::MoreUtils qw(uniq);
12 use Unicode::Normalize qw();
15 use SL::DB::FileFullText;
# Dispatch table: maps a MIME type to the sub that extracts the searchable
# text from a file of that type. The job's main loop skips files whose MIME
# type has no entry here and calls the selected sub with the file name.
18 my %extractor_by_mime_type = (
19 'application/pdf' => \&_pdf_to_strings,
20 'text/html' => \&_html_to_strings,
21 'text/plain' => \&_text_to_strings,
25 $_[0]->create_standard_job('20 3 * * *'); # cron spec: every day at 3:20 am
29 # If the job does not throw an error,
30 # success in background_job_histories is 'success'.
31 # It is 'failure' otherwise.
33 # The return value goes to result in background_job_histories.
# Fetch the file records to examine (no filter is passed to get_all).
39 my $all_dbfiles = SL::DB::Manager::File->get_all;
41 foreach my $dbfile (@$all_dbfiles) {
# Skip files whose stored full text is at least as new as the file itself;
# both sides use mtime and fall back to itime when mtime is not set.
42 next if $dbfile->full_text && (($dbfile->mtime || $dbfile->itime) <= ($dbfile->full_text->mtime || $dbfile->full_text->itime));
# Skip MIME types for which no extractor is registered.
43 next if !defined $extractor_by_mime_type{$dbfile->mime_type};
# Resolve the record to a file name. An exception thrown while doing so is
# caught by the eval and logged as a warning.
46 if (!eval { $file_name = SL::File->get(dbfile => $dbfile)->get_file(); 1; }) {
47 $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::run: get_file failed: " . $EVAL_ERROR);
# Run the extractor registered for this MIME type on the file.
51 my $text = $extractor_by_mime_type{$dbfile->mime_type}->($file_name);
# Store the result: update the existing full-text row if there is one,
# otherwise create a new row linked to this file.
53 if ($dbfile->full_text) {
54 $dbfile->full_text->update_attributes(full_text => $text);
56 SL::DB::FileFullText->new(file => $dbfile, full_text => $text)->save;
# Build the pdftotext command line; "-enc UTF-8" selects UTF-8 output.
66 my @cmd = qw(pdftotext -enc UTF-8);
67 push @cmd, $file_name;
# Run the command, capturing its stdout in $txt and its stderr in $err.
72 IPC::Run::run \@cmd, \undef, \$txt, \$err;
# Log the child's exit status ($CHILD_ERROR >> 8) together with its stderr.
# NOTE(review): the message says "_pdf_to_text", while the dispatch table
# names the PDF extractor _pdf_to_strings -- presumably this sub; confirm and
# align the message.
75 $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::_pdf_to_text failed for '$file_name': " . ($CHILD_ERROR >> 8) . ": " . $err);
# Decode the captured bytes as UTF-8 into a character string.
79 $txt = Encode::decode('utf-8-strict', $txt);
# Collapse every run of Unicode whitespace into a single space.
81 $txt =~ s{\p{WSpace}+}{ }g;
# Bring the text into Unicode normalization form C.
82 $txt = Unicode::Normalize::normalize('C', $txt);
# Split into whitespace-separated words, drop duplicate words and re-join
# the remaining ones with single spaces.
83 $txt = join ' ' , uniq(split(' ', $txt));
# Extract the words of an HTML file as one space-separated string in which
# each distinct word occurs only once.
88 sub _html_to_strings {
# Read the whole file.
91 my $txt = read_file($file_name);
# Decode the file content as UTF-8 into a character string.
93 $txt = Encode::decode('utf-8-strict', $txt);
# presumably removes the HTML markup and keeps the text content -- verify
# against SL::HTML::Util::strip
94 $txt = SL::HTML::Util::strip($txt);
# Collapse every run of Unicode whitespace into a single space.
96 $txt =~ s{\p{WSpace}+}{ }g;
# Bring the text into Unicode normalization form C.
97 $txt = Unicode::Normalize::normalize('C', $txt);
# Split into whitespace-separated words, drop duplicate words and re-join
# the remaining ones with single spaces.
98 $txt = join ' ' , uniq(split(' ', $txt));
# Extract the words of a plain-text file as one space-separated string in
# which each distinct word occurs only once.
#
# Parameter: $file_name -- name of the file to read.
103 sub _text_to_strings {
104 my ($file_name) = @_;
# Read the whole file.
106 my $txt = read_file($file_name);
# Decode the file content as UTF-8 into a character string.
108 $txt = Encode::decode('utf-8-strict', $txt);
# Collapse every run of Unicode whitespace into a single space.
110 $txt =~ s{\p{WSpace}+}{ }g;
# Bring the text into Unicode normalization form C.
111 $txt = Unicode::Normalize::normalize('C', $txt);
# Split into whitespace-separated words, drop duplicate words and re-join
# the remaining ones with single spaces.
112 $txt = join ' ' , uniq(split(' ', $txt));
127 SL::BackgroundJob::CreateOrUpdateFileFullTexts - Extract text strings/words from
128 files in the DMS for full text search.
132 Search all documents in the files table and try to extract strings from them
133 and store the strings in the database.
135 Duplicate strings/words in one text are removed.
137 Strings are updated if the change or creation time of the document is newer than
142 Bernd Bleßmann E<lt>bernd@kivitendo-premium.deE<gt>