package Mojo::DOM; use Mojo::Base -strict; use overload '@{}' => sub { shift->child_nodes }, '%{}' => sub { shift->attr }, bool => sub {1}, '""' => sub { shift->to_string }, fallback => 1; # "Fry: This snow is beautiful. I'm glad global warming never happened. # Leela: Actually, it did. But thank God nuclear winter canceled it out." use Carp 'croak'; use Mojo::Collection; use Mojo::DOM::CSS; use Mojo::DOM::HTML; use Scalar::Util 'weaken'; sub all_text { _text(_nodes(shift->tree), 1) } sub ancestors { _select($_[0]->_collect([$_[0]->_ancestors]), $_[1]) } sub append { shift->_add(1, @_) } sub append_content { shift->_content(1, 0, @_) } sub at { my $self = shift; return undef unless my $result = $self->_css->select_one(@_); return $self->_build($result, $self->xml); } sub attr { my $self = shift; # Hash my $tree = $self->tree; my $attrs = $tree->[0] ne 'tag' ? {} : $tree->[2]; return $attrs unless @_; # Get return $attrs->{$_[0]} unless @_ > 1 || ref $_[0]; # Set my $values = ref $_[0] ? $_[0] : {@_}; @$attrs{keys %$values} = values %$values; return $self; } sub child_nodes { $_[0]->_collect(_nodes($_[0]->tree)) } sub children { _select($_[0]->_collect(_nodes($_[0]->tree, 1)), $_[1]) } sub content { my $self = shift; my $type = $self->type; if ($type eq 'root' || $type eq 'tag') { return $self->_content(0, 1, @_) if @_; my $html = Mojo::DOM::HTML->new(xml => $self->xml); return join '', map { $html->tree($_)->render } @{_nodes($self->tree)}; } return $self->tree->[1] unless @_; $self->tree->[1] = shift; return $self; } sub descendant_nodes { $_[0]->_collect(_all(_nodes($_[0]->tree))) } sub find { $_[0]->_collect($_[0]->_css->select($_[1])) } sub following { _select($_[0]->_collect($_[0]->_siblings(1, 1)), $_[1]) } sub following_nodes { $_[0]->_collect($_[0]->_siblings(0, 1)) } sub matches { shift->_css->matches(@_) } sub namespace { my $self = shift; return undef if (my $tree = $self->tree)->[0] ne 'tag'; # Extract namespace prefix and search parents my $ns = $tree->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef; for my $node ($tree, $self->_ancestors) { # Namespace for prefix my $attrs = $node->[2]; if ($ns) { $_ eq $ns and return $attrs->{$_} for keys %$attrs } # Namespace attribute elsif (defined $attrs->{xmlns}) { return $attrs->{xmlns} } } return undef; } sub new { my $class = shift; my $self = bless \Mojo::DOM::HTML->new, ref $class || $class; return @_ ? $self->parse(@_) : $self; } sub next { $_[0]->_maybe($_[0]->_siblings(1, 1, 0)) } sub next_node { $_[0]->_maybe($_[0]->_siblings(0, 1, 0)) } sub parent { my $self = shift; return undef if (my $tree = $self->tree)->[0] eq 'root'; return $self->_build(_parent($tree), $self->xml); } sub parse { shift->_delegate(parse => @_) } sub preceding { _select($_[0]->_collect($_[0]->_siblings(1, 0)), $_[1]) } sub preceding_nodes { $_[0]->_collect($_[0]->_siblings(0)) } sub prepend { shift->_add(0, @_) } sub prepend_content { shift->_content(0, 0, @_) } sub previous { $_[0]->_maybe($_[0]->_siblings(1, 0, -1)) } sub previous_node { $_[0]->_maybe($_[0]->_siblings(0, 0, -1)) } sub remove { shift->replace('') } sub replace { my ($self, $new) = @_; return $self->parse($new) if (my $tree = $self->tree)->[0] eq 'root'; return $self->_replace(_parent($tree), $tree, _nodes($self->_parse($new))); } sub root { my $self = shift; return $self unless my $tree = $self->_ancestors(1); return $self->_build($tree, $self->xml); } sub strip { my $self = shift; return $self if (my $tree = $self->tree)->[0] ne 'tag'; return $self->_replace($tree->[3], $tree, _nodes($tree)); } sub tag { my ($self, $tag) = @_; return undef if (my $tree = $self->tree)->[0] ne 'tag'; return $tree->[1] unless $tag; $tree->[1] = $tag; return $self; } sub tap { shift->Mojo::Base::tap(@_) } sub text { _text(_nodes(shift->tree), 0) } sub to_string { shift->_delegate('render') } sub tree { shift->_delegate(tree => @_) } sub type { shift->tree->[0] } sub val { my $self = shift; # "option" return $self->{value} // $self->text if (my $tag = $self->tag) eq 'option'; # "input" ("type=checkbox" and "type=radio") my $type = $self->{type} // ''; return $self->{value} // 'on' if $tag eq 'input' && ($type eq 'radio' || $type eq 'checkbox'); # "textarea", "input" or "button" return $tag eq 'textarea' ? $self->text : $self->{value} if $tag ne 'select'; # "select" my $v = $self->find('option:checked:not([disabled])') ->grep(sub { !$_->ancestors('optgroup[disabled]')->size })->map('val'); return exists $self->{multiple} ? $v->size ? $v->to_array : undef : $v->last; } sub with_roles { shift->Mojo::Base::with_roles(@_) } sub wrap { shift->_wrap(0, @_) } sub wrap_content { shift->_wrap(1, @_) } sub xml { shift->_delegate(xml => @_) } sub _add { my ($self, $offset, $new) = @_; return $self if (my $tree = $self->tree)->[0] eq 'root'; my $parent = _parent($tree); splice @$parent, _offset($parent, $tree) + $offset, 0, @{_link($parent, _nodes($self->_parse($new)))}; return $self; } sub _all { my $nodes = shift; @$nodes = map { $_->[0] eq 'tag' ? ($_, @{_all(_nodes($_))}) : ($_) } @$nodes; return $nodes; } sub _ancestors { my ($self, $root) = @_; return () unless my $tree = _parent($self->tree); my @ancestors; do { push @ancestors, $tree } while ($tree->[0] eq 'tag') && ($tree = $tree->[3]); return $root ? $ancestors[-1] : @ancestors[0 .. $#ancestors - 1]; } sub _build { shift->new->tree(shift)->xml(shift) } sub _collect { my ($self, $nodes) = (shift, shift // []); my $xml = $self->xml; return Mojo::Collection->new(map { $self->_build($_, $xml) } @$nodes); } sub _content { my ($self, $start, $offset, $new) = @_; my $tree = $self->tree; unless ($tree->[0] eq 'root' || $tree->[0] eq 'tag') { my $old = $self->content; return $self->content($start ? $old . $new : $new . $old); } $start = $start ? ($#$tree + 1) : _start($tree); $offset = $offset ? $#$tree : 0; splice @$tree, $start, $offset, @{_link($tree, _nodes($self->_parse($new)))}; return $self; } sub _css { Mojo::DOM::CSS->new(tree => shift->tree) } sub _delegate { my ($self, $method) = (shift, shift); return $$self->$method unless @_; $$self->$method(@_); return $self; } sub _link { my ($parent, $children) = @_; # Link parent to children for my $node (@$children) { my $offset = $node->[0] eq 'tag' ? 3 : 2; $node->[$offset] = $parent; weaken $node->[$offset]; } return $children; } sub _maybe { $_[1] ? $_[0]->_build($_[1], $_[0]->xml) : undef } sub _nodes { return () unless my $tree = shift; my @nodes = @$tree[_start($tree) .. $#$tree]; return shift() ? [grep { $_->[0] eq 'tag' } @nodes] : \@nodes; } sub _offset { my ($parent, $child) = @_; my $i = _start($parent); $_ eq $child ? last : $i++ for @$parent[$i .. $#$parent]; return $i; } sub _parent { $_[0]->[$_[0][0] eq 'tag' ? 3 : 2] } sub _parse { Mojo::DOM::HTML->new(xml => shift->xml)->parse(shift)->tree } sub _replace { my ($self, $parent, $child, $nodes) = @_; splice @$parent, _offset($parent, $child), 1, @{_link($parent, $nodes)}; return $self->parent; } sub _select { $_[1] ? $_[0]->grep(matches => $_[1]) : $_[0] } sub _siblings { my ($self, $tags, $tail, $i) = @_; return defined $i ? undef : [] if (my $tree = $self->tree)->[0] eq 'root'; my $nodes = _nodes(_parent($tree)); my $match = -1; defined($match++) and $_ eq $tree and last for @$nodes; if ($tail) { splice @$nodes, 0, $match + 1 } else { splice @$nodes, $match, ($#$nodes + 1) - $match } @$nodes = grep { $_->[0] eq 'tag' } @$nodes if $tags; return defined $i ? $i == -1 && !@$nodes ? undef : $nodes->[$i] : $nodes; } sub _start { $_[0][0] eq 'root' ? 1 : 4 } sub _text { my ($nodes, $all) = @_; my $text = ''; while (my $node = shift @$nodes) { my $type = $node->[0]; # Text if ($type eq 'text' || $type eq 'cdata' || $type eq 'raw') { $text .= $node->[1]; } # Nested tag elsif ($type eq 'tag' && $all) { unshift @$nodes, @{_nodes($node)} } } return $text; } sub _wrap { my ($self, $content, $new) = @_; return $self if (my $tree = $self->tree)->[0] eq 'root' && !$content; return $self if $tree->[0] ne 'root' && $tree->[0] ne 'tag' && $content; # Find innermost tag my $current; my $first = $new = $self->_parse($new); $current = $first while $first = _nodes($first, 1)->[0]; return $self unless $current; # Wrap content if ($content) { push @$current, @{_link($current, _nodes($tree))}; splice @$tree, _start($tree), $#$tree, @{_link($tree, _nodes($new))}; return $self; } # Wrap element $self->_replace(_parent($tree), $tree, _nodes($new)); push @$current, @{_link($current, [$tree])}; return $self; } 1; =encoding utf8 =head1 NAME Mojo::DOM - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS use Mojo::DOM; # Parse my $dom = Mojo::DOM->new('

Test

123

'); # Find say $dom->at('#b')->text; say $dom->find('p')->map('text')->join("\n"); say $dom->find('[id]')->map(attr => 'id')->join("\n"); # Iterate $dom->find('p[id]')->reverse->each(sub { say $_->{id} }); # Loop for my $e ($dom->find('p[id]')->each) { say $e->{id}, ':', $e->text; } # Modify $dom->find('div p')->last->append('

456

'); $dom->find(':not(p)')->map('strip'); # Render say "$dom"; =head1 DESCRIPTION L is a minimalistic and relaxed HTML/XML DOM parser with CSS selector support. It will even try to interpret broken HTML and XML, so you should not use it for validation. =head1 NODES AND ELEMENTS When we parse an HTML/XML fragment, it gets turned into a tree of nodes. Hello World! There are currently eight different kinds of nodes, C, C, C, C, C, C, C and C. Elements are nodes of the type C. root |- doctype (html) +- tag (html) |- tag (head) | +- tag (title) | +- raw (Hello) +- tag (body) +- text (World!) While all node types are represented as L objects, some methods like L and L only apply to elements. =head1 CASE-SENSITIVITY L defaults to HTML semantics, that means all tags and attribute names are lowercased and selectors need to be lowercase as well. # HTML semantics my $dom = Mojo::DOM->new('

Hi!

'); say $dom->at('p[id]')->text; If an XML declaration is found, the parser will automatically switch into XML mode and everything becomes case-sensitive. # XML semantics my $dom = Mojo::DOM->new('

Hi!

'); say $dom->at('P[ID]')->text; HTML or XML semantics can also be forced with the L method. # Force HTML semantics my $dom = Mojo::DOM->new->xml(0)->parse('

Hi!

'); say $dom->at('p[id]')->text; # Force XML semantics my $dom = Mojo::DOM->new->xml(1)->parse('

Hi!

'); say $dom->at('P[ID]')->text; =head1 METHODS L implements the following methods. =head2 all_text my $text = $dom->all_text; Extract text content from all descendant nodes of this element. # "foo\nbarbaz\n" $dom->parse("
foo\n

bar

baz\n
")->at('div')->all_text; =head2 ancestors my $collection = $dom->ancestors; my $collection = $dom->ancestors('div ~ p'); Find all ancestor elements of this node matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # List tag names of ancestor elements say $dom->ancestors->map('tag')->join("\n"); =head2 append $dom = $dom->append('

I ♥ Mojolicious!

'); Append HTML/XML fragment to this node (for all node types other than C). # "

Test

123

" $dom->parse('

Test

') ->at('h1')->append('

123

')->root; # "

Test 123

" $dom->parse('

Test

')->at('p') ->child_nodes->first->append(' 123')->root; =head2 append_content $dom = $dom->append_content('

I ♥ Mojolicious!

'); Append HTML/XML fragment (for C and C nodes) or raw content to this node's content. # "

Test123

" $dom->parse('

Test

') ->at('h1')->append_content('123')->root; # "
" $dom->parse('
') ->child_nodes->first->append_content('123 ')->root; # "

Test123

" $dom->parse('

Test

')->at('p')->append_content('123')->root; =head2 at my $result = $dom->at('div ~ p'); Find first descendant element of this element matching the CSS selector and return it as a L object, or C if none could be found. All selectors from L are supported. # Find first element with "svg" namespace definition my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'}; =head2 attr my $hash = $dom->attr; my $foo = $dom->attr('foo'); $dom = $dom->attr({foo => 'bar'}); $dom = $dom->attr(foo => 'bar'); This element's attributes. # Remove an attribute delete $dom->attr->{id}; # Attribute without value $dom->attr(selected => undef); # List id attributes say $dom->find('*')->map(attr => 'id')->compact->join("\n"); =head2 child_nodes my $collection = $dom->child_nodes; Return a L object containing all child nodes of this element as L objects. # "

123

" $dom->parse('

Test123

')->at('p')->child_nodes->first->remove; # "" $dom->parse('123')->child_nodes->first; # " Test " $dom->parse('123')->child_nodes->last->content; =head2 children my $collection = $dom->children; my $collection = $dom->children('div ~ p'); Find all child elements of this element matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # Show tag name of random child element say $dom->children->shuffle->first->tag; =head2 content my $str = $dom->content; $dom = $dom->content('

I ♥ Mojolicious!

'); Return this node's content or replace it with HTML/XML fragment (for C and C nodes) or raw content. # "Test" $dom->parse('
Test
')->at('div')->content; # "

123

" $dom->parse('

Test

')->at('h1')->content('123')->root; # "

123

" $dom->parse('

Test

')->at('p')->content('123')->root; # "

" $dom->parse('

Test

')->at('h1')->content('')->root; # " Test " $dom->parse('
')->child_nodes->first->content; # "
456
" $dom->parse('
456
') ->at('div')->child_nodes->first->content(' 123 ')->root; =head2 descendant_nodes my $collection = $dom->descendant_nodes; Return a L object containing all descendant nodes of this element as L objects. # "

123

" $dom->parse('

123

') ->descendant_nodes->grep(sub { $_->type eq 'comment' }) ->map('remove')->first; # "

testtest

" $dom->parse('

123456

') ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' }) ->map(content => 'test')->first->root; =head2 find my $collection = $dom->find('div ~ p'); Find all descendant elements of this element matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # Find a specific element and extract information my $id = $dom->find('div')->[23]{id}; # Extract information from multiple elements my @headers = $dom->find('h1, h2, h3')->map('text')->each; # Count all the different tags my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {}); # Find elements with a class that contains dots my @divs = $dom->find('div.foo\.bar')->each; =head2 following my $collection = $dom->following; my $collection = $dom->following('div ~ p'); Find all sibling elements after this node matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # List tags of sibling elements after this node say $dom->following->map('tag')->join("\n"); =head2 following_nodes my $collection = $dom->following_nodes; Return a L object containing all sibling nodes after this node as L objects. # "C" $dom->parse('

A

C')->at('p')->following_nodes->last->content; =head2 matches my $bool = $dom->matches('div ~ p'); Check if this element matches the CSS selector. All selectors from L are supported. # True $dom->parse('

A

')->at('p')->matches('.a'); $dom->parse('

A

')->at('p')->matches('p[class]'); # False $dom->parse('

A

')->at('p')->matches('.b'); $dom->parse('

A

')->at('p')->matches('p[id]'); =head2 namespace my $namespace = $dom->namespace; Find this element's namespace, or return C if none could be found. # Find namespace for an element with namespace prefix my $namespace = $dom->at('svg > svg\:circle')->namespace; # Find namespace for an element that may or may not have a namespace prefix my $namespace = $dom->at('svg > circle')->namespace; =head2 new my $dom = Mojo::DOM->new; my $dom = Mojo::DOM->new('I ♥ Mojolicious!'); Construct a new scalar-based L object and L HTML/XML fragment if necessary. =head2 next my $sibling = $dom->next; Return L object for next sibling element, or C if there are no more siblings. # "

123

" $dom->parse('

Test

123

')->at('h1')->next; =head2 next_node my $sibling = $dom->next_node; Return L object for next sibling node, or C if there are no more siblings. # "456" $dom->parse('

123456

') ->at('b')->next_node->next_node; # " Test " $dom->parse('

123456

') ->at('b')->next_node->content; =head2 parent my $parent = $dom->parent; Return L object for parent of this node, or C if this node has no parent. # "Test" $dom->parse('

Test

')->at('i')->parent; =head2 parse $dom = $dom->parse('I ♥ Mojolicious!'); Parse HTML/XML fragment with L. # Parse XML my $dom = Mojo::DOM->new->xml(1)->parse('I ♥ Mojolicious!'); =head2 preceding my $collection = $dom->preceding; my $collection = $dom->preceding('div ~ p'); Find all sibling elements before this node matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # List tags of sibling elements before this node say $dom->preceding->map('tag')->join("\n"); =head2 preceding_nodes my $collection = $dom->preceding_nodes; Return a L object containing all sibling nodes before this node as L objects. # "A" $dom->parse('A

C

')->at('p')->preceding_nodes->first->content; =head2 prepend $dom = $dom->prepend('

I ♥ Mojolicious!

'); Prepend HTML/XML fragment to this node (for all node types other than C). # "

Test

123

" $dom->parse('

123

') ->at('h2')->prepend('

Test

')->root; # "

Test 123

" $dom->parse('

123

') ->at('p')->child_nodes->first->prepend('Test ')->root; =head2 prepend_content $dom = $dom->prepend_content('

I ♥ Mojolicious!

'); Prepend HTML/XML fragment (for C and C nodes) or raw content to this node's content. # "

Test123

" $dom->parse('

123

') ->at('h2')->prepend_content('Test')->root; # "
" $dom->parse('
') ->child_nodes->first->prepend_content(' Test')->root; # "

123Test

" $dom->parse('

Test

')->at('p')->prepend_content('123')->root; =head2 previous my $sibling = $dom->previous; Return L object for previous sibling element, or C if there are no more siblings. # "

Test

" $dom->parse('

Test

123

')->at('h2')->previous; =head2 previous_node my $sibling = $dom->previous_node; Return L object for previous sibling node, or C if there are no more siblings. # "123" $dom->parse('

123456

') ->at('b')->previous_node->previous_node; # " Test " $dom->parse('

123456

') ->at('b')->previous_node->content; =head2 remove my $parent = $dom->remove; Remove this node and return L (for C nodes) or L. # "
" $dom->parse('

Test

')->at('h1')->remove; # "

456

" $dom->parse('

123456

') ->at('p')->child_nodes->first->remove->root; =head2 replace my $parent = $dom->replace('
I ♥ Mojolicious!
'); Replace this node with HTML/XML fragment and return L (for C nodes) or L. # "

123

" $dom->parse('

Test

')->at('h1')->replace('

123

'); # "

123

" $dom->parse('

Test

') ->at('p')->child_nodes->[0]->replace('123')->root; =head2 root my $root = $dom->root; Return L object for C node. =head2 strip my $parent = $dom->strip; Remove this element while preserving its content and return L. # "
Test
" $dom->parse('

Test

')->at('h1')->strip; =head2 tag my $tag = $dom->tag; $dom = $dom->tag('div'); This element's tag name. # List tag names of child elements say $dom->children->map('tag')->join("\n"); =head2 tap $dom = $dom->tap(sub {...}); Alias for L. =head2 text my $text = $dom->text; Extract text content from this element only (not including child elements). # "bar" $dom->parse("
foo

bar

baz
")->at('p')->text; # "foo\nbaz\n" $dom->parse("
foo\n

bar

baz\n
")->at('div')->text; =head2 to_string my $str = $dom->to_string; Render this node and its content to HTML/XML. # "Test" $dom->parse('
Test
')->at('div b')->to_string; =head2 tree my $tree = $dom->tree; $dom = $dom->tree(['root']); Document Object Model. Note that this structure should only be used very carefully since it is very dynamic. =head2 type my $type = $dom->type; This node's type, usually C, C, C, C, C, C, C or C. # "cdata" $dom->parse('')->child_nodes->first->type; # "comment" $dom->parse('')->child_nodes->first->type; # "doctype" $dom->parse('')->child_nodes->first->type; # "pi" $dom->parse('')->child_nodes->first->type; # "raw" $dom->parse('Test')->at('title')->child_nodes->first->type; # "root" $dom->parse('

Test

')->type; # "tag" $dom->parse('

Test

')->at('p')->type; # "text" $dom->parse('

Test

')->at('p')->child_nodes->first->type; =head2 val my $value = $dom->val; Extract value from form element (such as C