package HTML::TreeBuilder;
# ABSTRACT: Parser that builds a HTML syntax tree
use warnings;
use strict;
use integer; # vroom vroom!
use Carp ();
our $VERSION = '5.07'; # VERSION from OurPkgVersion
#---------------------------------------------------------------------------
# Make a 'DEBUG' constant...
our $DEBUG; # Must be set BEFORE loading this file
BEGIN {
# We used to have things like
# print $indent, "lalala" if $Debug;
# But there were an awful lot of having to evaluate $Debug's value.
# If we make that depend on a constant, like so:
# sub DEBUG () { 1 } # or whatever value.
# ...
# print $indent, "lalala" if DEBUG;
# Which at compile-time (thru the miracle of constant folding) turns into:
# print $indent, "lalala";
# or, if DEBUG is a constant with a true value, then that print statement
# is simply optimized away, and doesn't appear in the target code at all.
# If you don't believe me, run:
# perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \
# $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder'
# and see for yourself (substituting whatever value you want for $DEBUG
# there).
## no critic
if ( defined &DEBUG ) {
# Already been defined! Do nothing.
}
elsif ( $] < 5.00404 ) {
# Grudgingly accomodate ancient (pre-constant) versions.
eval 'sub DEBUG { $Debug } ';
}
elsif ( !$DEBUG ) {
eval 'sub DEBUG () {0}'; # Make it a constant.
}
elsif ( $DEBUG =~ m<^\d+$>s ) {
eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
}
else { # WTF?
warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG";
eval 'sub DEBUG () { $DEBUG }'; # I guess.
}
## use critic
}
#---------------------------------------------------------------------------
use HTML::Entities ();
use HTML::Tagset 3.02 ();
use HTML::Element ();
use HTML::Parser 3.46 ();
our @ISA = qw(HTML::Element HTML::Parser);
# This looks schizoid, I know.
# It's not that we ARE an element AND a parser.
# We ARE an element, but one that knows how to handle signals
# (method calls) from Parser in order to elaborate its subtree.
# Legacy aliases:
*HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
*HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
*HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
*HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
*HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
*HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
*HTML::TreeBuilder::isList = \%HTML::Tagset::isList;
*HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
*HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
*HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
#==========================================================================
# Two little shortcut constructors:
sub new_from_file { # or from a FH
my $class = shift;
Carp::croak("new_from_file takes only one argument")
unless @_ == 1;
Carp::croak("new_from_file is a class method only")
if ref $class;
my $new = $class->new();
defined $new->parse_file( $_[0] )
or Carp::croak("unable to parse file: $!");
return $new;
}
sub new_from_content { # from any number of scalars
my $class = shift;
Carp::croak("new_from_content is a class method only")
if ref $class;
my $new = $class->new();
foreach my $whunk (@_) {
if ( ref($whunk) eq 'SCALAR' ) {
$new->parse($$whunk);
}
else {
$new->parse($whunk);
}
last if $new->{'_stunted'}; # might as well check that.
}
$new->eof();
return $new;
}
sub new_from_url { # should accept anything that LWP does.
undef our $lwp_response;
my $class = shift;
Carp::croak("new_from_url takes only one argument")
unless @_ == 1;
Carp::croak("new_from_url is a class method only")
if ref $class;
my $url = shift;
my $new = $class->new();
require LWP::UserAgent;
# RECOMMEND PREREQ: LWP::UserAgent 5.815
LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method
$lwp_response = LWP::UserAgent->new->get( $url );
Carp::croak("GET failed on $url: " . $lwp_response->status_line)
unless $lwp_response->is_success;
Carp::croak("$url returned " . $lwp_response->content_type . " not HTML")
unless $lwp_response->content_is_html;
$new->parse( $lwp_response->decoded_content );
$new->eof;
undef $lwp_response; # Processed successfully
return $new;
}
# TODO: document more fully?
sub parse_content { # from any number of scalars
my $tree = shift;
my $retval;
foreach my $whunk (@_) {
if ( ref($whunk) eq 'SCALAR' ) {
$retval = $tree->parse($$whunk);
}
else {
$retval = $tree->parse($whunk);
}
last if $tree->{'_stunted'}; # might as well check that.
}
$tree->eof();
return $retval;
}
#---------------------------------------------------------------------------
sub new { # constructor!
my $class = shift;
$class = ref($class) || $class;
# Initialize HTML::Element part
my $self = $class->element_class->new('html');
{
# A hack for certain strange versions of Parser:
my $other_self = HTML::Parser->new();
%$self = ( %$self, %$other_self ); # copy fields
# Yes, multiple inheritance is messy. Kids, don't try this at home.
bless $other_self, "HTML::TreeBuilder::_hideyhole";
# whack it out of the HTML::Parser class, to avoid the destructor
}
# The root of the tree is special, as it has these funny attributes,
# and gets reblessed into this class.
# Initialize parser settings
$self->{'_implicit_tags'} = 1;
$self->{'_implicit_body_p_tag'} = 0;
# If true, trying to insert text, or any of %isPhraseMarkup right
# under 'body' will implicate a 'p'. If false, will just go there.
$self->{'_tighten'} = 1;
# whether ignorable WS in this tree should be deleted
$self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
$self->{'_ignore_unknown'} = 1;
$self->{'_ignore_text'} = 0;
$self->{'_warn'} = 0;
$self->{'_no_space_compacting'} = 0;
$self->{'_store_comments'} = 0;
$self->{'_store_declarations'} = 1;
$self->{'_store_pis'} = 0;
$self->{'_p_strict'} = 0;
$self->{'_no_expand_entities'} = 0;
# Parse attributes passed in as arguments
if (@_) {
my %attr = @_;
for ( keys %attr ) {
$self->{"_$_"} = $attr{$_};
}
}
$HTML::Element::encoded_content = $self->{'_no_expand_entities'};
# rebless to our class
bless $self, $class;
$self->{'_element_count'} = 1;
# undocumented, informal, and maybe not exactly correct
$self->{'_head'} = $self->insert_element( 'head', 1 );
$self->{'_pos'} = undef; # pull it back up
$self->{'_body'} = $self->insert_element( 'body', 1 );
$self->{'_pos'} = undef; # pull it back up again
return $self;
}
#==========================================================================
sub _elem # universal accessor...
{
my ( $self, $elem, $val ) = @_;
my $old = $self->{$elem};
$self->{$elem} = $val if defined $val;
return $old;
}
# accessors....
sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); }
sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); }
sub p_strict { shift->_elem( '_p_strict', @_ ); }
sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); }
sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); }
sub ignore_text { shift->_elem( '_ignore_text', @_ ); }
sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); }
sub store_comments { shift->_elem( '_store_comments', @_ ); }
sub store_declarations { shift->_elem( '_store_declarations', @_ ); }
sub store_pis { shift->_elem( '_store_pis', @_ ); }
sub warn { shift->_elem( '_warn', @_ ); }
sub no_expand_entities {
shift->_elem( '_no_expand_entities', @_ );
$HTML::Element::encoded_content = @_;
}
#==========================================================================
sub warning {
my $self = shift;
CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
# should maybe say HTML::TreeBuilder instead
}
#==========================================================================
{
# To avoid having to rebuild these lists constantly...
my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
my $indent;
sub start {
return if $_[0]{'_stunted'};
# Accept a signal from HTML::Parser for start-tags.
my ( $self, $tag, $attr ) = @_;
# Parser passes more, actually:
# $self->start($tag, $attr, $attrseq, $origtext)
# But we can merrily ignore $attrseq and $origtext.
if ( $tag eq 'x-html' ) {
print "Ignoring open-x-html tag.\n" if DEBUG;
# inserted by some lame code-generators.
return; # bypass tweaking.
}
$tag =~ s{/$}{}s; # So turns into . Silently forgive.
unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
return;
# This avoids having Element's new() throw an exception.
}
my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'};
my $already_inserted;
#my($indent);
if (DEBUG) {
# optimization -- don't figure out indenting unless we're in debug mode
my @lineage = $pos->lineage;
$indent = ' ' x ( 1 + @lineage );
print $indent, "Proposing a new \U$tag\E under ",
join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) )
|| 'Root',
".\n";
#} else {
# $indent = ' ';
}
#print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
# $attr = {%$attr};
foreach my $k ( keys %$attr ) {
# Make sure some stooge doesn't have "".
# That happens every few million Web pages.
$attr->{ ' ' . $k } = delete $attr->{$k}
if length $k and substr( $k, 0, 1 ) eq '_';
# Looks bad, but is fine for round-tripping.
}
my $e = $self->element_class->new( $tag, %$attr );
# Make a new element object.
# (Only rarely do we end up just throwing it away later in this call.)
# Some prep -- custom messiness for those damned tables, and strict P's.
if ( $self->{'_implicit_tags'} ) { # wallawallawalla!
unless ( $HTML::TreeBuilder::isTableElement{$tag} ) {
if ( $ptag eq 'table' ) {
print $indent,
" * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
if DEBUG > 1;
$self->insert_element( 'tr', 1 );
$pos = $self->insert_element( 'td', 1 )
; # yes, needs updating
}
elsif ( $ptag eq 'tr' ) {
print $indent,
" * Phrasal \U$tag\E right under TR makes an implicit TD\n"
if DEBUG > 1;
$pos = $self->insert_element( 'td', 1 )
; # yes, needs updating
}
$ptag = $pos->{'_tag'}; # yes, needs updating
}
# end of table-implication block.
# Now maybe do a little dance to enforce P-strictness.
# This seems like it should be integrated with the big
# "ALL HOPE..." block, further below, but that doesn't
# seem feasable.
if ( $self->{'_p_strict'}
and $HTML::TreeBuilder::isKnown{$tag}
and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} )
{
my $here = $pos;
my $here_tag = $ptag;
while (1) {
if ( $here_tag eq 'p' ) {
print $indent, " * Inserting $tag closes strict P.\n"
if DEBUG > 1;
$self->end( \q{p} );
# NB: same as \'q', but less confusing to emacs cperl-mode
last;
}
#print("Lasting from $here_tag\n"),
last
if $HTML::TreeBuilder::isKnown{$here_tag}
and
not $HTML::Tagset::is_Possible_Strict_P_Content{
$here_tag};
# Don't keep looking up the tree if we see something that can't
# be strict-P content.
$here_tag
= ( $here = $here->{'_parent'} || last )->{'_tag'};
} # end while
$ptag = ( $pos = $self->{'_pos'} || $self )
->{'_tag'}; # better update!
}
# end of strict-p block.
}
# And now, get busy...
#----------------------------------------------------------------------
if ( !$self->{'_implicit_tags'} ) { # bimskalabim
# do nothing
print $indent, " * _implicit_tags is off. doing nothing\n"
if DEBUG > 1;
#----------------------------------------------------------------------
}
elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) {
if ( $pos->is_inside('body') ) { # all is well
print $indent,
" * ambilocal element \U$tag\E is fine under BODY.\n"
if DEBUG > 1;
}
elsif ( $pos->is_inside('head') ) {
print $indent,
" * ambilocal element \U$tag\E is fine under HEAD.\n"
if DEBUG > 1;
}
else {
# In neither head nor body! mmmmm... put under head?
if ( $ptag eq 'html' ) { # expected case
# TODO?? : would there ever be a case where _head would be
# absent from a tree that would ever be accessed at this
# point?
die "Where'd my head go?" unless ref $self->{'_head'};
if ( $self->{'_head'}{'_implicit'} ) {
print $indent,
" * ambilocal element \U$tag\E makes an implicit HEAD.\n"
if DEBUG > 1;
# or rather, points us at it.
$self->{'_pos'}
= $self->{'_head'}; # to insert under...
}
else {
$self->warning(
"Ambilocal element <$tag> not under HEAD or BODY!?"
);
# Put it under HEAD by default, I guess
$self->{'_pos'}
= $self->{'_head'}; # to insert under...
}
}
else {
# Neither under head nor body, nor right under html... pass thru?
$self->warning(
"Ambilocal element <$tag> neither under head nor body, nor right under html!?"
);
}
}
#----------------------------------------------------------------------
}
elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) {
# Ensure that we are within
if ( $ptag eq 'body' ) {
# We're good.
}
elsif (
$HTML::TreeBuilder::isBodyElement{$ptag} # glarg
and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag}
)
{
# Special case: Save ourselves a call to is_inside further down.
# If our $ptag is an isBodyElement element (but not an
# isHeadOrBodyElement element), then we must be under body!
print $indent, " * Inferring that $ptag is under BODY.\n",
if DEBUG > 3;
# I think this and the test for 'body' trap everything
# bodyworthy, except the case where the parent element is
# under an unknown element that's a descendant of body.
}
elsif ( $pos->is_inside('head') ) {
print $indent,
" * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
if DEBUG > 1;
$ptag = (
$pos = $self->{'_pos'}
= $self->{'_body'} # yes, needs updating
|| die "Where'd my body go?"
)->{'_tag'}; # yes, needs updating
}
elsif ( !$pos->is_inside('body') ) {
print $indent,
" * body-element \U$tag\E makes implicit BODY.\n"
if DEBUG > 1;
$ptag = (
$pos = $self->{'_pos'}
= $self->{'_body'} # yes, needs updating
|| die "Where'd my body go?"
)->{'_tag'}; # yes, needs updating
}
# else we ARE under body, so okay.
# Handle implicit endings and insert based on and position
# ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
if ( $tag eq 'p'
or $tag eq 'h1'
or $tag eq 'h2'
or $tag eq 'h3'
or $tag eq 'h4'
or $tag eq 'h5'
or $tag eq 'h6'
or $tag eq 'form'
# Hm, should