| 1 | package HTML::HeadParser; |
| 2 | |
| 3 | =head1 NAME |
| 4 | |
| 5 | HTML::HeadParser - Parse <HEAD> section of a HTML document |
| 6 | |
| 7 | =head1 SYNOPSIS |
| 8 | |
| 9 | require HTML::HeadParser; |
| 10 | $p = HTML::HeadParser->new; |
| 11 | $p->parse($text) and print "not finished"; |
| 12 | |
| 13 | $p->header('Title') # to access <title>....</title> |
| 14 | $p->header('Content-Base') # to access <base href="http://..."> |
| 15 | $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> |
| 16 | |
| 17 | =head1 DESCRIPTION |
| 18 | |
| 19 | The I<HTML::HeadParser> is a specialized (and lightweight) |
| 20 | I<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> |
| 21 | section of an HTML document. The parse() method |
| 22 | will return a FALSE value as soon as some E<lt>BODY> element or body |
| 23 | text are found, and should not be called again after this. |
| 24 | |
| 25 | The I<HTML::HeadParser> keeps a reference to a header object, and the |
| 26 | parser will update this header object as the various elements of the |
| 27 | E<lt>HEAD> section of the HTML document are recognized. The following |
| 28 | header fields are affected: |
| 29 | |
| 30 | =over 4 |
| 31 | |
| 32 | =item Content-Base: |
| 33 | |
| 34 | The I<Content-Base> header is initialized from the E<lt>base |
| 35 | href="..."> element. |
| 36 | |
| 37 | =item Title: |
| 38 | |
| 39 | The I<Title> header is initialized from the E<lt>title>...E<lt>/title> |
| 40 | element. |
| 41 | |
| 42 | =item Isindex: |
| 43 | |
| 44 | The I<Isindex> header will be added if there is a E<lt>isindex> |
| 45 | element in the E<lt>head>. The header value is initialized from the |
| 46 | I<prompt> attribute if it is present. If no I<prompt> attribute is |
| 47 | given it will have '?' as the value. |
| 48 | |
| 49 | =item X-Meta-Foo: |
| 50 | |
| 51 | All E<lt>meta> elements will initialize headers with the prefix |
| 52 | "C<X-Meta->" on the name. If the E<lt>meta> element contains a |
| 53 | C<http-equiv> attribute, then it will be honored as the header name. |
| 54 | |
| 55 | =back |
| 56 | |
| 57 | =head1 METHODS |
| 58 | |
| 59 | The following methods (in addition to those provided by the |
| 60 | superclass) are available: |
| 61 | |
| 62 | =over 4 |
| 63 | |
| 64 | =cut |
| 65 | |
| 66 | |
| 67 | require HTML::Parser; |
| 68 | @ISA = qw(HTML::Parser); |
| 69 | |
| 70 | use HTML::Entities (); |
| 71 | |
| 72 | use strict; |
| 73 | use vars qw($VERSION $DEBUG); |
| 74 | #$DEBUG = 1; |
| 75 | $VERSION = sprintf("%d.%02d", q$Revision: 2.17 $ =~ /(\d+)\.(\d+)/); |
| 76 | |
| 77 | =item $hp = HTML::HeadParser->new( [$header] ) |
| 78 | |
| 79 | The object constructor. The optional $header argument should be a |
| 80 | reference to an object that implement the header() and push_header() |
| 81 | methods as defined by the I<HTTP::Headers> class. Normally it will be |
| 82 | of some class that isa or delegates to the I<HTTP::Headers> class. |
| 83 | |
| 84 | If no $header is given I<HTML::HeadParser> will create an |
| 85 | I<HTTP::Header> object by itself (initially empty). |
| 86 | |
| 87 | =cut |
| 88 | |
| 89 | sub new |
| 90 | { |
| 91 | my($class, $header) = @_; |
| 92 | unless ($header) { |
| 93 | require HTTP::Headers; |
| 94 | $header = HTTP::Headers->new; |
| 95 | } |
| 96 | |
| 97 | my $self = $class->SUPER::new(api_version => 2, |
| 98 | ignore_elements => [qw(script style)], |
| 99 | ); |
| 100 | $self->{'header'} = $header; |
| 101 | $self->{'tag'} = ''; # name of active element that takes textual content |
| 102 | $self->{'text'} = ''; # the accumulated text associated with the element |
| 103 | $self; |
| 104 | } |
| 105 | |
| 106 | =item $hp->header; |
| 107 | |
| 108 | Returns a reference to the header object. |
| 109 | |
| 110 | =item $hp->header( $key ) |
| 111 | |
| 112 | Returns a header value. It is just a shorter way to write |
| 113 | C<$hp-E<gt>header-E<gt>header($key)>. |
| 114 | |
| 115 | =cut |
| 116 | |
| 117 | sub header |
| 118 | { |
| 119 | my $self = shift; |
| 120 | return $self->{'header'} unless @_; |
| 121 | $self->{'header'}->header(@_); |
| 122 | } |
| 123 | |
| 124 | sub as_string # legacy |
| 125 | { |
| 126 | my $self = shift; |
| 127 | $self->{'header'}->as_string; |
| 128 | } |
| 129 | |
| 130 | sub flush_text # internal |
| 131 | { |
| 132 | my $self = shift; |
| 133 | my $tag = $self->{'tag'}; |
| 134 | my $text = $self->{'text'}; |
| 135 | $text =~ s/^\s+//; |
| 136 | $text =~ s/\s+$//; |
| 137 | $text =~ s/\s+/ /g; |
| 138 | print "FLUSH $tag => '$text'\n" if $DEBUG; |
| 139 | if ($tag eq 'title') { |
| 140 | HTML::Entities::decode($text); |
| 141 | $self->{'header'}->header(Title => $text); |
| 142 | } |
| 143 | $self->{'tag'} = $self->{'text'} = ''; |
| 144 | } |
| 145 | |
| 146 | # This is an quote from the HTML3.2 DTD which shows which elements |
| 147 | # that might be present in a <HEAD>...</HEAD>. Also note that the |
| 148 | # <HEAD> tags themselves might be missing: |
| 149 | # |
| 150 | # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & |
| 151 | # SCRIPT* & META* & LINK*"> |
| 152 | # |
| 153 | # <!ELEMENT HEAD O O (%head.content)> |
| 154 | |
| 155 | |
| 156 | sub start |
| 157 | { |
| 158 | my($self, $tag, $attr) = @_; # $attr is reference to a HASH |
| 159 | print "START[$tag]\n" if $DEBUG; |
| 160 | $self->flush_text if $self->{'tag'}; |
| 161 | if ($tag eq 'meta') { |
| 162 | my $key = $attr->{'http-equiv'}; |
| 163 | if (!defined($key) || !length($key)) { |
| 164 | return unless $attr->{'name'}; |
| 165 | $key = "X-Meta-\u$attr->{'name'}"; |
| 166 | } |
| 167 | $self->{'header'}->push_header($key => $attr->{content}); |
| 168 | } elsif ($tag eq 'base') { |
| 169 | return unless exists $attr->{href}; |
| 170 | $self->{'header'}->header('Content-Base' => $attr->{href}); |
| 171 | } elsif ($tag eq 'isindex') { |
| 172 | # This is a non-standard header. Perhaps we should just ignore |
| 173 | # this element |
| 174 | $self->{'header'}->header(Isindex => $attr->{prompt} || '?'); |
| 175 | } elsif ($tag =~ /^(?:title|script|style)$/) { |
| 176 | # Just remember tag. Initialize header when we see the end tag. |
| 177 | $self->{'tag'} = $tag; |
| 178 | } elsif ($tag eq 'link') { |
| 179 | return unless exists $attr->{href}; |
| 180 | # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> |
| 181 | my $h_val = "<" . delete($attr->{href}) . ">"; |
| 182 | for (sort keys %{$attr}) { |
| 183 | $h_val .= qq(; $_="$attr->{$_}"); |
| 184 | } |
| 185 | $self->{'header'}->push_header(Link => $h_val); |
| 186 | } elsif ($tag eq 'head' || $tag eq 'html') { |
| 187 | # ignore |
| 188 | } else { |
| 189 | # stop parsing |
| 190 | $self->eof; |
| 191 | } |
| 192 | } |
| 193 | |
| 194 | sub end |
| 195 | { |
| 196 | my($self, $tag) = @_; |
| 197 | print "END[$tag]\n" if $DEBUG; |
| 198 | $self->flush_text if $self->{'tag'}; |
| 199 | $self->eof if $tag eq 'head'; |
| 200 | } |
| 201 | |
| 202 | sub text |
| 203 | { |
| 204 | my($self, $text) = @_; |
| 205 | print "TEXT[$text]\n" if $DEBUG; |
| 206 | my $tag = $self->{tag}; |
| 207 | if (!$tag && $text =~ /\S/) { |
| 208 | # Normal text means start of body |
| 209 | $self->eof; |
| 210 | return; |
| 211 | } |
| 212 | return if $tag ne 'title'; |
| 213 | $self->{'text'} .= $text; |
| 214 | } |
| 215 | |
| 216 | 1; |
| 217 | |
| 218 | __END__ |
| 219 | |
| 220 | =head1 EXAMPLE |
| 221 | |
| 222 | $h = HTTP::Headers->new; |
| 223 | $p = HTML::HeadParser->new($h); |
| 224 | $p->parse(<<EOT); |
| 225 | <title>Stupid example</title> |
| 226 | <base href="http://www.linpro.no/lwp/"> |
| 227 | Normal text starts here. |
| 228 | EOT |
| 229 | undef $p; |
| 230 | print $h->title; # should print "Stupid example" |
| 231 | |
| 232 | =head1 SEE ALSO |
| 233 | |
| 234 | L<HTML::Parser>, L<HTTP::Headers> |
| 235 | |
| 236 | The I<HTTP::Headers> class is distributed as part of the I<libwww-perl> |
| 237 | package. |
| 238 | |
| 239 | =head1 COPYRIGHT |
| 240 | |
| 241 | Copyright 1996-2001 Gisle Aas. All rights reserved. |
| 242 | |
| 243 | This library is free software; you can redistribute it and/or |
| 244 | modify it under the same terms as Perl itself. |
| 245 | |
| 246 | =cut |
| 247 | |