--- /dev/null
+package SL::BackgroundJob::CreateOrUpdateFileFullTexts;
+
+use strict;
+
+use parent qw(SL::BackgroundJob::Base);
+
+use Encode qw(decode);
+use English qw( -no_match_vars );
+use File::Slurp qw(read_file);
+use List::MoreUtils qw(uniq);
+use IPC::Run qw();
+use Unicode::Normalize qw();
+
+use SL::DB::File;
+use SL::DB::FileFullText;
+use SL::HTML::Util;
+
+my %extractor_by_mime_type = (
+ 'application/pdf' => \&_pdf_to_strings,
+ 'text/html' => \&_html_to_strings,
+ 'text/plain' => \&_text_to_strings,
+);
+
+sub create_job {
+ $_[0]->create_standard_job('7 * * * *'); # seven minutes after every hour
+}
+
+#
+# If job does not throw an error,
+# success in background_job_histories is 'success'.
+# It is 'failure' otherwise.
+#
+# return value goes to result in background_job_histories
+#
+sub run {
+ my $self = shift;
+ my $db_obj = shift;
+
+ my $all_dbfiles = SL::DB::Manager::File->get_all;
+
+ foreach my $dbfile (@$all_dbfiles) {
+ next if $dbfile->full_text && (($dbfile->mtime || $dbfile->itime) <= ($dbfile->full_text->mtime || $dbfile->full_text->itime));
+ next if !defined $extractor_by_mime_type{$dbfile->mime_type};
+
+ my $file_name;
+ if (!eval { $file_name = SL::File->get(dbfile => $dbfile)->get_file(); 1; }) {
+ $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::run: get_file failed: " . $EVAL_ERROR);
+ next;
+ }
+
+ my $text = $extractor_by_mime_type{$dbfile->mime_type}->($file_name);
+
+ if ($dbfile->full_text) {
+ $dbfile->full_text->update_attributes(full_text => $text);
+ } else {
+ SL::DB::FileFullText->new(file => $dbfile, full_text => $text)->save;
+ }
+ }
+
+ return 'ok';
+}
+
+sub _pdf_to_strings {
+ my ($file_name) = @_;
+
+ my @cmd = qw(pdftotext -enc UTF-8);
+ push @cmd, $file_name;
+ push @cmd, '-';
+
+ my ($txt, $err);
+
+ IPC::Run::run \@cmd, \undef, \$txt, \$err;
+
+ if ($CHILD_ERROR) {
+ $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::_pdf_to_text failed for '$file_name': " . ($CHILD_ERROR >> 8) . ": " . $err);
+ return '';
+ }
+
+ $txt = Encode::decode('utf-8-strict', $txt);
+ $txt =~ s{\r}{ }g;
+ $txt =~ s{\p{WSpace}+}{ }g;
+ $txt = Unicode::Normalize::normalize('C', $txt);
+ $txt = join ' ' , uniq(split(' ', $txt));
+
+ return $txt;
+}
+
+sub _html_to_strings {
+ my ($file_name) = @_;
+
+ my $txt = read_file($file_name);
+
+ $txt = Encode::decode('utf-8-strict', $txt);
+ $txt = SL::HTML::Util::strip($txt);
+ $txt =~ s{\r}{ }g;
+ $txt =~ s{\p{WSpace}+}{ }g;
+ $txt = Unicode::Normalize::normalize('C', $txt);
+ $txt = join ' ' , uniq(split(' ', $txt));
+
+ return $txt;
+}
+
+sub _text_to_strings {
+ my ($file_name) = @_;
+
+ my $txt = read_file($file_name);
+
+ $txt = Encode::decode('utf-8-strict', $txt);
+ $txt =~ s{\r}{ }g;
+ $txt =~ s{\p{WSpace}+}{ }g;
+ $txt = Unicode::Normalize::normalize('C', $txt);
+ $txt = join ' ' , uniq(split(' ', $txt));
+
+ return $txt;
+}
+
+1;