my %stripper;
my %entities = (
- 'lt' => '<',
- 'gt' => '>',
- 'amp' => '&',
+ 'lt' => '<',
+ 'gt' => '>',
+ 'amp' => '&',
+ 'nbsp' => ' ', # should be => "\x{00A0}", but this can lead to problems with
+ # a non-visible character in csv-exports for example
);
sub strip {
my $value = !ref($class_or_value) && (($class_or_value // '') eq 'SL::HTML::Util') ? $_[1] : $class_or_value;
+ return '' unless defined $value;
+
+ # Remove HTML comments.
+ $value =~ s{ <!-- .*? --> }{}gx;
+
if (!%stripper) {
%stripper = ( parser => HTML::Parser->new );
- $stripper{parser}->handler(text => sub { $stripper{text} .= $_[1]; });
+ $stripper{parser}->handler(text => sub { $stripper{text} .= ' ' . $_[1]; });
}
$stripper{text} = '';
$stripper{parser}->parse($value);
$stripper{parser}->eof;
- $stripper{text} =~ s{\&([^;]+);}{ $entities{$1} }eg;
+ $stripper{text} =~ s{\&([^;]+);}{ $entities{$1} || "\&$1;" }eg;
+ $stripper{text} =~ s{^ +| +$}{}g;
+ $stripper{text} =~ s{ {2,}}{ }g;
return delete $stripper{text};
}
+sub plain_text_to_html {
+ my ($class_or_text) = @_;
+
+ my $text = !ref($class_or_text) && (($class_or_text // '') eq 'SL::HTML::Util') ? $_[1] : $class_or_text;
+
+ return $text if $text =~ m{^<p>.*</p>$};
+
+ $text =~ s{\r+}{}g;
+ $text =~ s{^[[:space:]]+|[[:space:]]+$}{}g;
+
+ return '' if $text eq '';
+
+ my @paragraphs;
+
+ foreach my $paragraph (split m{\n{2,}}, $text) {
+ no warnings 'once';
+ $paragraph = $::locale->quote_special_chars('HTML', $paragraph);
+ $paragraph =~ s{\n}{<br>}g;
+
+ push @paragraphs, $paragraph;
+ }
+
+ return '<p>' . join('</p><p>', @paragraphs) . '</p>';
+}
+
1;
__END__
Removes all HTML elements and tags from C<$html_content> and returns
the remaining plain text.
+=item C<plain_text_to_html $text>
+
+Converts a plain text to HTML: paragraphs will be recognized by empty
+lines; remaining newlines will be converted into forced line breaks;
+the rest will be HTML escaped.
+
=back
=head1 BUGS