1 package SL::BackgroundJob::CreateOrUpdateFileFullTexts;
5 use parent qw(SL::BackgroundJob::Base);
8 use English qw( -no_match_vars );
9 use File::Slurp qw(read_file);
10 use List::MoreUtils qw(uniq);
12 use Unicode::Normalize qw();
15 use SL::DB::FileFullText;
# Dispatch table: MIME type => reference to the sub that extracts the
# searchable text from a file of that type. Each extractor is called with the
# file name as its only argument. A file whose MIME type has no entry here is
# skipped by the job loop (the `next if !defined ...` check).
18 my %extractor_by_mime_type = (
19 'application/pdf' => \&_pdf_to_strings,
20 'text/html' => \&_html_to_strings,
21 'text/plain' => \&_text_to_strings,
# Calls create_standard_job on the first argument ($_[0], the invocant) with
# a cron-style schedule string.
# NOTE(review): the enclosing sub header is not visible in this excerpt.
25 $_[0]->create_standard_job('7 * * * *'); # seven minutes after every hour
29 # If job does not throw an error,
30 # success in background_job_histories is 'success'.
31 # It is 'failure' otherwise.
33 # return value goes to result in background_job_histories
#
# Overview of the steps visible here: load every file record, skip the ones
# whose stored full text is already current or whose MIME type has no
# extractor, extract the text, then update or create the full-text record.
# NOTE(review): the sub header and some lines are not visible in this
# excerpt; the log message below suggests this is the body of `run`.
#
# Fetch all file records at once.
39 my $all_dbfiles = SL::DB::Manager::File->get_all;
41 foreach my $dbfile (@$all_dbfiles) {
# Skip when a full-text record exists and the file's timestamp (mtime,
# falling back to itime) is not newer than the full-text record's timestamp
# (same fallback). Presumably mtime/itime are modification/insertion times
# -- confirm against the model classes.
42 next if $dbfile->full_text && (($dbfile->mtime || $dbfile->itime) <= ($dbfile->full_text->mtime || $dbfile->full_text->itime));
# Skip MIME types that have no entry in the dispatch table.
43 next if !defined $extractor_by_mime_type{$dbfile->mime_type};
# Resolve the record to a file name. The call is wrapped in eval (the
# trailing `1;` makes the block true on success) so that a failure is
# logged as a warning rather than propagating as an exception.
46 if (!eval { $file_name = SL::File->get(dbfile => $dbfile)->get_file(); 1; }) {
47 $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::run: get_file failed: " . $EVAL_ERROR);
# Run the extractor registered for this MIME type on the file.
# NOTE(review): in the lines visible here this call is not guarded by eval;
# confirm whether an exception raised by an extractor is meant to abort the
# whole run.
51 my $text = $extractor_by_mime_type{$dbfile->mime_type}->($file_name);
# Update the existing full-text record if there is one ...
53 if ($dbfile->full_text) {
54 $dbfile->full_text->update_attributes(full_text => $text);
# ... otherwise create and save a new record linked to this file.
56 SL::DB::FileFullText->new(file => $dbfile, full_text => $text)->save;
# Extracts text from a PDF by running the external `pdftotext` tool.
# NOTE(review): the sub header is not visible in this excerpt; presumably
# this is the body of _pdf_to_strings from the dispatch table.
#
# Build the command as a list: UTF-8 output encoding, then the input file.
# Any further arguments are not visible here.
66 my @cmd = qw(pdftotext -enc UTF-8);
67 push @cmd, $file_name;
# Run the command with no stdin input (\undef), capturing stdout in $txt and
# stderr in $err.
72 IPC::Run::run \@cmd, \undef, \$txt, \$err;
# Log a warning with the child's exit code ($CHILD_ERROR >> 8) and its
# captured stderr.
# NOTE(review): the message names `_pdf_to_text`, while the dispatch table
# refers to `_pdf_to_strings` -- confirm which name the log should carry.
75 $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::_pdf_to_text failed for '$file_name': " . ($CHILD_ERROR >> 8) . ": " . $err);
# Decode the captured bytes as UTF-8. No CHECK argument is passed, so
# Encode's default handling of malformed input applies.
79 $txt = Encode::decode('utf-8-strict', $txt);
# Collapse every run of Unicode whitespace into a single space.
81 $txt =~ s{\p{WSpace}+}{ }g;
# Normalize to Unicode Normalization Form C.
82 $txt = Unicode::Normalize::normalize('C', $txt);
# Split on whitespace, drop duplicate tokens (uniq) and re-join with single
# spaces, so each distinct token appears only once in the result.
83 $txt = join ' ' , uniq(split(' ', $txt));
# Extracts the searchable text from an HTML file.
# NOTE(review): argument unpacking and the end of the sub are not visible in
# this excerpt.
88 sub _html_to_strings {
# Read the whole file into one string.
91 my $txt = read_file($file_name);
# Decode the bytes as UTF-8.
# NOTE(review): the file is always decoded as UTF-8; a charset declared in
# the document is not consulted in the visible code.
93 $txt = Encode::decode('utf-8-strict', $txt);
# Pass the text through SL::HTML::Util::strip (presumably removes the
# markup -- confirm against that helper).
94 $txt = SL::HTML::Util::strip($txt);
# Collapse every run of Unicode whitespace into a single space.
96 $txt =~ s{\p{WSpace}+}{ }g;
# Normalize to Unicode Normalization Form C.
97 $txt = Unicode::Normalize::normalize('C', $txt);
# Split on whitespace, drop duplicate tokens (uniq) and re-join with single
# spaces, so each distinct token appears only once in the result.
98 $txt = join ' ' , uniq(split(' ', $txt));
103 sub _text_to_strings {
104 my ($file_name) = @_;
106 my $txt = read_file($file_name);
108 $txt = Encode::decode('utf-8-strict', $txt);
110 $txt =~ s{\p{WSpace}+}{ }g;
111 $txt = Unicode::Normalize::normalize('C', $txt);
112 $txt = join ' ' , uniq(split(' ', $txt));