package HTML::Form; use strict; use URI; use Carp (); use Encode (); use vars qw($VERSION); $VERSION = "6.03"; my %form_tags = map {$_ => 1} qw(input textarea button select option); my %type2class = ( text => "TextInput", password => "TextInput", hidden => "TextInput", textarea => "TextInput", "reset" => "IgnoreInput", radio => "ListInput", checkbox => "ListInput", option => "ListInput", button => "SubmitInput", submit => "SubmitInput", image => "ImageInput", file => "FileInput", keygen => "KeygenInput", ); # The new HTML5 input types %type2class = (%type2class, map { $_ => 'TextInput' } qw( tel search url email datetime date month week time datetime-local number range color )); =head1 NAME HTML::Form - Class that represents an HTML form element =head1 SYNOPSIS use HTML::Form; $form = HTML::Form->parse($html, $base_uri); $form->value(query => "Perl"); use LWP::UserAgent; $ua = LWP::UserAgent->new; $response = $ua->request($form->click); =head1 DESCRIPTION Objects of the C class represents a single HTML CformE ... E/formE> instance. A form consists of a sequence of inputs that usually have names, and which can take on various values. The state of a form can be tweaked and it can then be asked to provide C objects that can be passed to the request() method of C. The following methods are available: =over 4 =item @forms = HTML::Form->parse( $html_document, $base_uri ) =item @forms = HTML::Form->parse( $html_document, base => $base_uri, %opt ) =item @forms = HTML::Form->parse( $response, %opt ) The parse() class method will parse an HTML document and build up C objects for each
element found. If called in scalar context only returns the first . Returns an empty list if there are no forms to be found. The required arguments is the HTML document to parse ($html_document) and the URI used to retrieve the document ($base_uri). The base URI is needed to resolve relative action URIs. The provided HTML document should be a Unicode string (or US-ASCII). By default HTML::Form assumes that the original document was UTF-8 encoded and thus encode forms that don't specify an explicit I as UTF-8. The charset assumed can be overridden by providing the C option to parse(). It's a good idea to be explicit about this parameter as well, thus the recommended simplest invocation becomes: my @forms = HTML::Form->parse( Encode::decode($encoding, $html_document_bytes), base => $base_uri, charset => $encoding, ); If the document was retrieved with LWP then the response object provide methods to obtain a proper value for C and C: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response->decoded_content, base => $response->base, charset => $response->content_charset, ); In fact, the parse() method can parse from an C object directly, so the example above can be more conveniently written as: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response); Note that any object that implements a decoded_content(), base() and content_charset() method with similar behaviour as C will do. Additional options might be passed in to control how the parse method behaves. The following are all the options currently recognized: =over =item C<< base => $uri >> This is the URI used to retrive the original document. This option is not optional ;-) =item C<< charset => $str >> Specify what charset the original document was encoded in. This is used as the default for accept_charset. If not provided this defaults to "UTF-8". =item C<< verbose => $bool >> Warn (print messages to STDERR) about any bad HTML form constructs found. You can trap these with $SIG{__WARN__}. The default is not to issue warnings. =item C<< strict => $bool >> Initialize any form objects with the given strict attribute. If the strict is turned on the methods that change values of the form will croak if you try to set illegal values or modify readonly fields. The default is not to be strict. =back =cut sub parse { my $class = shift; my $html = shift; unshift(@_, "base") if @_ == 1; my %opt = @_; require HTML::TokeParser; my $p = HTML::TokeParser->new(ref($html) ? $html->decoded_content(ref => 1) : \$html); die "Failed to create HTML::TokeParser object" unless $p; my $base_uri = delete $opt{base}; my $charset = delete $opt{charset}; my $strict = delete $opt{strict}; my $verbose = delete $opt{verbose}; if ($^W) { Carp::carp("Unrecognized option $_ in HTML::Form->parse") for sort keys %opt; } unless (defined $base_uri) { if (ref($html)) { $base_uri = $html->base; } else { Carp::croak("HTML::Form::parse: No \$base_uri provided"); } } unless (defined $charset) { if (ref($html) and $html->can("content_charset")) { $charset = $html->content_charset; } unless ($charset) { $charset = "UTF-8"; } } my @forms; my $f; # current form my %openselect; # index to the open instance of a select while (my $t = $p->get_tag) { my($tag,$attr) = @$t; if ($tag eq "form") { my $action = delete $attr->{'action'}; $action = "" unless defined $action; $action = URI->new_abs($action, $base_uri); $f = $class->new($attr->{'method'}, $action, $attr->{'enctype'}); $f->accept_charset($attr->{'accept-charset'}) if $attr->{'accept-charset'}; $f->{default_charset} = $charset; $f->{attr} = $attr; $f->strict(1) if $strict; %openselect = (); push(@forms, $f); my(%labels, $current_label); while (my $t = $p->get_tag) { my($tag, $attr) = @$t; last if $tag eq "/form"; if ($tag ne 'textarea') { # if we are inside a label tag, then keep # appending any text to the current label if(defined $current_label) { $current_label = join " ", grep { defined and length } $current_label, $p->get_phrase; } } if ($tag eq "input") { $attr->{value_name} = exists $attr->{id} && exists $labels{$attr->{id}} ? $labels{$attr->{id}} : defined $current_label ? $current_label : $p->get_phrase; } if ($tag eq "label") { $current_label = $p->get_phrase; $labels{ $attr->{for} } = $current_label if exists $attr->{for}; } elsif ($tag eq "/label") { $current_label = undef; } elsif ($tag eq "input") { my $type = delete $attr->{type} || "text"; $f->push_input($type, $attr, $verbose); } elsif ($tag eq "button") { my $type = delete $attr->{type} || "submit"; $f->push_input($type, $attr, $verbose); } elsif ($tag eq "textarea") { $attr->{textarea_value} = $attr->{value} if exists $attr->{value}; my $text = $p->get_text("/textarea"); $attr->{value} = $text; $f->push_input("textarea", $attr, $verbose); } elsif ($tag eq "select") { # rename attributes reserved to come for the option tag for ("value", "value_name") { $attr->{"select_$_"} = delete $attr->{$_} if exists $attr->{$_}; } # count this new select option separately my $name = $attr->{name}; $name = "" unless defined $name; $openselect{$name}++; while ($t = $p->get_tag) { my $tag = shift @$t; last if $tag eq "/select"; next if $tag =~ m,/?optgroup,; next if $tag eq "/option"; if ($tag eq "option") { my %a = %{$t->[0]}; # rename keys so they don't clash with %attr for (keys %a) { next if $_ eq "value"; $a{"option_$_"} = delete $a{$_}; } while (my($k,$v) = each %$attr) { $a{$k} = $v; } $a{value_name} = $p->get_trimmed_text; $a{value} = delete $a{value_name} unless defined $a{value}; $a{idx} = $openselect{$name}; $f->push_input("option", \%a, $verbose); } else { warn("Bad here, so we # try to do the same. Actually the MSIE behaviour # appears really strange: and