# Html.pm: Class Used for Objects to Manage HTML Data
package RDA::Object::Html;
# $Id: Html.pm,v 1.12 2015/04/29 13:55:39 RDA Exp $
# ARCS: $Header: /home/cvs/cvs/RDA_8/src/scripting/lib/RDA/Object/Html.pm,v 1.12 2015/04/29 13:55:39 RDA Exp $
#
# Change History
# 20150429 MSC Change tracing.
=head1 NAME
RDA::Object::Html - Class Used for Objects to Manage HTML Data
=head1 SYNOPSIS
require RDA::Object::Html;
=head1 DESCRIPTION
The objects of the C class are used to manage HTML data.
The following methods are available:
=cut
use strict;
BEGIN
{ use Exporter;
use RDA::Text qw(debug get_string);
use RDA::Object;
use RDA::Driver::Sgml;
}
# Define the global public variables
use vars qw($SHORT $STRINGS $VERSION @ISA %SDCL);
$SHORT = 1;
$VERSION = sprintf('%d.%02d', q$Revision: 1.12 $ =~ /(\d+)\.(\d+)/);
@ISA = qw(RDA::Driver::Sgml RDA::Object Exporter);
%SDCL = (
inc => [qw(RDA::Object)],
met => {
'decode' => {ret => 0},
'disable' => {ret => 0},
'encode' => {ret => 0},
'exists' => {ret => 0},
'filter' => {ret => 0},
'find' => {ret => 1},
'fix' => {ret => 0},
'get_attr' => {ret => 1},
'get_content' => {ret => 1},
'get_error' => {ret => 0},
'get_name' => {ret => 0},
'get_status' => {ret => 0},
'get_tables' => {ret => 1},
'get_text' => {ret => 0},
'get_type' => {ret => 0},
'get_value' => {ret => 0},
'parse' => {ret => 0},
'parse_command' => {ret => 0},
'parse_file' => {ret => 0},
'set_trace' => {ret => 0},
},
new => 1,
trc => 'HTML',
);
# Define the global private constants
my $SPC = q{ };
# Define the global private variables
my %tb_emp = map {$_ => 1} qw(area base basefont bgsound br col embed frame hr
img input isindex link meta param spacer wbr);
my %tb_par =(
'dd' => [qw(dl)],
'dt' => [qw(dl)],
'li' => [qw(dir menu ol ul)],
'option' => [qw(form select)],
'select' => [qw(form)],
'td' => [qw(table tr)],
'textarea' => [qw(form)],
'th' => [qw(table tr)],
'tr' => [qw(table)],
);
my %tb_phr = map {$_ => 1} qw(a abbr acronym b br basefont bdo big blink cite
code dfn em embed font i img kbd nobr noembed q
s samp small spacer span strike strong sub sup tt
u var wbr);
# Report the package version
sub Version
{ return $VERSION;
}
=head2 S<$h = RDA::Object::Html-Enew([$level])>
The object constructor.
C is represented by a blessed hash reference. Along with the
keys inherited from C, the following special keys are used
also:
=over 12
=item S > Auto fix HTML code indicator (false by default)
=item S > Tag filter hash
=back
=cut
sub new
{ my ($cls, $lvl) = @_;
# Return the object reference
return bless RDA::Driver::Sgml->new('H', 'HTML> ', $lvl), ref($cls) || $cls;
}
=head2 S<$h-Edump([$level[,$text[,$trace]]])>
This method returns a string containing the object dump. You can provide an
indentation level, a prefix text, and a trace indicator as extra parameters.
=cut
sub dump ## no critic (Builtin)
{ my ($slf, $lvl, $txt, $trc) = @_;
my ($flg, $pre);
$lvl = 0 unless defined($lvl);
$flg = $trc && $SHORT && $slf->{'-typ'} ne 'H';
$pre = q{ } x $lvl;
return ($txt)
? "$pre$txt\n".$slf->SUPER::dump(0, $pre.q{ }, $flg)
: $slf->SUPER::dump(0, $pre, $flg);
}
=head2 S<$h-Eeof>
This method signals the end of the document, flushing any remaining buffered
text. It returns a reference to the parser object.
=cut
sub eof ## no critic (Builtin)
{ my $slf = shift;
my $buf = \$slf->{'-buf'};
# Assume rest is text
if (length($$buf))
{ $slf->add_text($$buf);
$$buf = q{};
}
# Insert pending text
$slf->add_text;
# Return the object reference
return $slf;
}
=head2 S<$h-Efilter([$tag,...])>
This method specifies the list of the tags to consider when parsing the
document. It can automatically add additional tags to the list for resolving
optional end tags. When the list is empty, it disables any tag filtering.
It returns the parser object reference.
=cut
sub filter
{ my ($slf, @tag) = @_;
# Update the tag filter list
delete($slf->{'-tag'});
foreach my $tag (@tag)
{ $tag = lc($tag);
$slf->{'-tag'}->{$tag} = 1;
if (exists($tb_par{$tag}))
{ for (@{$tb_par{$tag}})
{ $slf->{'-tag'}->{$_} = 1;
}
}
}
# Return the object reference
return $slf;
}
=head2 S<$h-Efix([$flag])>
This method indicates that the parser can fix incorrect HTML code. It returns
the parser object reference.
=cut
sub fix
{ my ($slf, $flg) = @_;
# Update the indicator
$slf->{'-fix'} = $flg;
# Return the object reference
return $slf;
}
=head2 S<$h-Eget_tables([$level[,$format]])>
This method extracts all significant tables from the parsed document. Cells in
bold or containing C or C in their style name are taken as
headings. It converts single cell rows in the header of the specified level. It
considers horizontal rulers and header lines also. The method returns the
result as a list of raw data lines.
=cut
sub get_tables
{ my ($slf, $lvl, $fmt) = @_;
# Extract the table content
my $obj = {
fmt => $fmt ? {center => \&_fmt_center,
left => \&_fmt_left,
right => \&_fmt_right}
: {left => \&_fmt_left},
hdr => q{},
lin => [],
lvl => (defined($lvl) && $lvl > 1) ? q{---}.(q{+} x $lvl).$SPC : '---+ ',
rec => [],
txt => q{}};
$slf->traverse(\&_table_row, $obj);
# Return the extracted information
return @{$obj->{'lin'}};
}
sub _table_row
{ my ($slf, $lvl, $flg, $obj) = @_;
my ($tag);
if ($flg)
{ if ($slf->{'-typ'} eq 'T')
{ $tag = $slf->{'-nam'};
if ($tag eq 'tr')
{ $obj->{'hdr'} =
(exists($slf->{'class'}) && $slf->{'class'} =~ m/head(er|ing)/)
? q{*}
: q{};
$obj->{'rec'} = [];
$obj->{'txt'} = q{};
}
elsif ($tag =~ m/^(h\d|td|tr)$/)
{ $obj->{'txt'} = q{};
}
}
elsif ($slf->{'-typ'} eq 'S')
{ $obj->{'txt'} .= $slf->{'-dat'};
}
}
elsif ($slf->{'-typ'} eq 'T')
{ $tag = $slf->{'-nam'};
if ($tag eq 'td')
{ _table_text($obj, $slf->{'align'} || 'left', 1);
}
elsif ($tag eq 'tr')
{ _table_text($obj, 'left');
my $cnt = @{$obj->{'rec'}};
if ($cnt == 1)
{ my $txt = $obj->{'rec'}->[0];
push(@{$obj->{'lin'}}, $obj->{'lvl'}.$slf->encode($txt, 1))
if $txt =~ m/\S/;
}
if ($cnt > 1)
{ push(@{$obj->{'lin'}},
q{|}.join(q{|}, map{$slf->encode($_, 1)} @{$obj->{'rec'}}).q{|});
}
$obj->{'rec'} = [];
}
elsif ($tag eq 'b')
{ $obj->{'txt'} = q{*}.$obj->{'txt'}.q{*};
}
elsif ($tag =~ m/^h(\d)$/)
{ push(@{$obj->{'lin'}},
q{---}.(q{+} x $1).$SPC.$slf->encode($obj->{'txt'}, 1));
$obj->{'txt'} = q{};
}
elsif ($tag eq 'hr')
{ push(@{$obj->{'lin'}}, q{---});
}
}
return 1;
}
sub _table_text
{ my ($obj, $fmt, $flg) = @_;
my $txt;
if (length($txt = $obj->{'txt'}))
{ $fmt = 'left' unless exists($obj->{'fmt'}->{$fmt});
$txt =~ s/\240/ /g;
push(@{$obj->{'rec'}}, &{$obj->{'fmt'}->{$fmt}}($txt, $obj->{'hdr'}));
$obj->{'txt'} = q{};
}
elsif ($flg && @{$obj->{'rec'}})
{ push(@{$obj->{'rec'}}, $SPC);
}
return;
}
sub _fmt_center
{ my ($txt, $hdr) = @_;
$txt =~ s/^\s*/ $hdr/;
$txt =~ s/\s*$/$hdr /;
return $txt;
}
sub _fmt_left
{ my ($txt, $hdr) = @_;
$txt =~ s/^\s*/$hdr/;
$txt =~ s/\s*$/$hdr/;
return ($txt =~ m/^\S/) ? $txt : $SPC;
}
sub _fmt_right
{ my ($txt, $hdr) = @_;
$txt =~ s/^\s*/ $hdr/;
$txt =~ s/\s*$/$hdr/;
return $txt;
}
=head2 S<$h-Eget_text>
This method returns the text contained in the object.
=cut
sub get_text
{ my ($slf) = @_;
my $buf = q{};
$slf->traverse(\&_text, \$buf);
return $buf;
}
*get_data = \&get_text;
sub _text
{ my ($slf, $lvl, $flg, $buf) = @_;
my $typ = $slf->{'-typ'};
if ($typ eq 'S')
{ $$buf .= $slf->{'-dat'} if $flg;
}
elsif ($typ eq 'T')
{ $$buf .= $SPC
unless exists($tb_phr{$slf->{'-nam'}}) ## no critic (Unless)
&& $slf->{'-nam'} ne 'br';
}
return 1;
}
=head2 S<$h-Eparse($string)>;
This method parses the specified string as the next HTML chunk. It returns a
reference to the HTML object.
=cut
sub parse ## no critic (Complex,Unpack)
{ my $slf = shift;
my $buf = \$slf->{'-buf'};
my $dbg = $slf->{'-lvl'};
# When EOF, assume rest is text
return $slf->eof unless defined($_[0]);
# Transfer a trailing carriage return to the next buffer
$_[0] =~ s{^}{\r} if delete($slf->{'-crf'});
$slf->{'-crf'} = 1 if $_[0] =~ s{\r\z}{};
# Filter out some characters
$_[0] =~ s{\r\n}{\n}g;
$_[0] =~ s{\r}{\n}g;
# Parse HTML in the buffer
$$buf .= $_[0];
$slf->debug_buffer(q{HTML> ## New Buffer}) if $dbg;
TOKEN: while ($$buf !~ m{^(?:<\/|<\?|debug_buffer(q{HTML> Buffer}) if $dbg;
# Parse the next token
if ($$buf =~ s{^([^<]+)}{}) # Plain text
{ # Extract any text before '<' characters
$slf->add_text($1);
last TOKEN unless length($$buf);
}
elsif ($$buf =~ s{^()}{}s) ## no critic (Capture)
{ # Need more data to get all data
$$buf = $cur.$$buf;
last TOKEN;
}
$slf->add_item('R', -dat => $2); ## no critic (Capture)
}
elsif ($$buf =~ s{^(]*?)--)}{})
{ $cur .= $1;
$txt .= $2;
# Look for end of comment
if ($$buf =~ s{^((.*?)--)}{}s)
{ $cur .= $1;
push(@com, $2) if $2;
}
else
{ # Need more data to extract the comment
$$buf = $cur.$$buf;
last TOKEN;
}
}
# Try to finish the declaration extraction
if ($$buf =~ s{^([^>]*)>}{})
{ $txt .= $1;
$slf->add_item('D', -dat => $txt) if $txt;
foreach my $com (@com)
{ $slf->add_item('R', -dat => $com);
}
}
else
{ # Need more data to extract the declaration
$$buf = $cur.$$buf;
last TOKEN;
}
}
elsif ($$buf =~ s{^}{}) # End tag
{ if ($$buf =~ s{^(([a-zA-Z][a-zA-Z0-9\.\-]*)\s*>)}{})
{ # Close the tag
$slf->_end_tag(lc($2));
}
elsif ($$buf =~ m{^[a-zA-Z][a-zA-Z0-9\.\-]*\s*$})
{ # Need more data for the end tag
$$buf = ''.$$buf;
last TOKEN;
}
else
{ # When not valid, consider it as text
$slf->add_text(q{});
}
}
elsif ($$buf =~ s{^(<([a-zA-Z]+)\s*(\/)?>)}{}) # Empty start tag
{ my $tag = lc($2);
$slf->_add_tag($tag, {}, $3 || exists($tb_emp{$tag}));
}
elsif ($$buf =~ s{^<}{}) # Start Tag
{ my ($cur, $tag, %tbl);
$cur = q{<};
$slf->debug_buffer('HTML> ++ Start tag found') if $dbg;
if ($$buf =~ s{^(([a-zA-Z][a-zA-Z0-9\.\-]*)\s*)}{})
{ # Extract the tag name
$cur .= $1;
$tag = lc($2);
# Extract attributes
while ($$buf =~ s{^(([a-zA-Z][a-zA-Z0-9:_\.\-]*)\s*)}{})
{ my ($nam);
$cur .= $1;
$nam = lc($2);
if ($$buf =~ s{(^=\s*([^\042\047>\s][^>\s]*)\s*)}{})
{ # Extract attribute value (unquoted)
$cur .= $1;
$tbl{$nam} = $slf->decode($2);
}
elsif ($$buf =~ s{(^=\s*([\042\047])(.*?)\2\s*)}{}s)
{ # Extract attribute value (quoted)
$cur .= $1;
$tbl{$nam} = $slf->decode($3);
# truncated just after the '=' or inside the attribute
}
elsif ($$buf =~ m{^(=\s*)$} || $$buf =~ m{^(=\s*[\042\047].*)}s)
{ # Need more data to extract attribute
$$buf = "$cur$1";
last TOKEN;
}
else
{ # Extract attribute value (implicit value)
$tbl{$nam} = $nam;
}
}
# Check start tag end
if ($$buf =~ s{^(\/)?>}{})
{ # Insert the tag in the object tree
$slf->_add_tag($tag, \%tbl, $1 || exists($tb_emp{$tag}));
}
elsif ($$buf =~ m{^<} && $slf->{'-fix'})
{ # Insert the tag in the object tree
$slf->_add_tag($tag, \%tbl, $1 || exists($tb_emp{$tag}));
}
elsif (length($$buf) && $$buf !~ m{^\/$})
{ # Not a conforming HTML declaration, consider it as text
$slf->add_text($cur);
}
else
{ # Need more data to parse the start tag
$$buf = $cur.$$buf;
last TOKEN;
}
}
elsif (length($$buf))
{ # Not a valid start tag, consider it as text
$slf->add_text($cur);
}
else
{ # Need more data to parse the start tag
$$buf = $cur.$$buf;
last TOKEN;
}
}
}
# Return the object reference
return $slf;
}
sub _add_tag
{ my ($slf, $tag, $tbl, $flg) = @_;
# When requested, filter the tag
if (exists($slf->{'-flt'}->{'T'})
|| (exists($slf->{'-tag'}) && !exists($slf->{'-tag'}->{$tag})))
{ $slf->add_text($SPC)
unless exists($tb_phr{$tag}) && $tag ne 'br'; ## no critic (Unless)
return;
}
# Terminate the current paragraph
$slf->_end_p unless exists($tb_phr{$tag});
# Treat tags with optional end tag
if (exists($tb_par{$tag}))
{ # Insert pending text
$slf->add_text;
# Retrieve its parent tag
$slf->_find_parent($tag, $tb_par{$tag});
}
# Disable text normalization in PRE blocks
$tbl->{'-txt'} = 0 if $tag eq 'pre';
# Create the tag element and insert in the list
$slf->add_item('T', -nam => $tag, %{$tbl});
# Go to the next level when an end tag is expected
$slf->push_item unless $flg;
return;
}
sub _end_p
{ my ($slf) = @_;
my ($cur, $nam);
# Insert pending text
$slf->add_text;
# Close all phrase tags
debug($slf->{'-pre'}.q{** Close tag 'p'}) if $slf->{'-lvl'};
$cur = $slf->{'-cur'};
for (; index('HX', $cur->{'-typ'}) < 0 ## no critic (Loop)
; $cur = $slf->pop_item)
{ next unless $cur->{'-typ'} eq 'T';
$nam = $cur->{'-nam'};
if ($nam eq 'p')
{ $slf->pop_item;
return;
}
return unless exists($tb_phr{$cur->{'-nam'}});
}
return;
}
sub _end_tag
{ my ($slf, $tag) = @_;
my ($cur, $nam, $tbl);
# When requested, filter the tag
if (exists($slf->{'-flt'}->{'T'})
|| (exists($slf->{'-tag'}) && !exists($slf->{'-tag'}->{$tag})))
{ $slf->add_text($SPC)
unless exists($tb_phr{$tag}) && $tag ne 'br'; ## no critic (Unless)
return;
}
# Insert pending text
$slf->add_text;
# Close the tag
$slf->save_stack;
debug($slf->{'-pre'}.qq{** Close tag $tag}) if $slf->{'-lvl'};
$cur = $slf->{'-cur'};
$tbl = exists($tb_par{$tag}) ? $tb_par{$tag} : [];
LEVEL: for (; index('HX', $cur->{'-typ'}) < 0 ## no critic (Loop)
; $cur = $slf->pop_item)
{ next unless $cur->{'-typ'} eq 'T';
$nam = $cur->{'-nam'};
if ($nam eq $tag)
{ $slf->pop_item;
return;
}
for (@{$tbl})
{ last LEVEL if $nam eq $_;
}
}
# Ignore it when no corresponding tag has been found
$slf->restore_stack;
++$slf->{'-err'};
debug(qq{ERR> Missing tag '$tag' !}) if $slf->{'-lvl'};
return;
}
sub _find_parent
{ my ($slf, $tag, $tbl) = @_;
my ($cur, $nam);
# Close the tag
$slf->save_stack;
$cur = $slf->{'-cur'};
for (; index('HX', $cur->{'-typ'}) < 0 ## no critic (Loop)
; $cur = $slf->pop_item)
{ next unless $cur->{'-typ'} eq 'T';
$nam = $cur->{'-nam'};
for (@{$tbl})
{ return if $_ eq $nam;
}
}
# Ignore it when parent tag has not been found
$slf->restore_stack;
++$slf->{'-err'};
debug(qq{ERR> Missing parent tag for '$tag' !}) if $slf->{'-lvl'};
return;
}
1;
__END__
=head1 SEE ALSO
L,
L
L,
L,
L
=head1 COPYRIGHT NOTICE
Copyright (c) 2002, 2016, Oracle and/or its affiliates. All rights reserved.
=head1 TRADEMARK NOTICE
Oracle and Java are registered trademarks of Oracle and/or its
affiliates. Other names may be trademarks of their respective owners.
=cut