Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package HTML::HeadParser; |
2 | ||
3 | =head1 NAME | |
4 | ||
5 | HTML::HeadParser - Parse <HEAD> section of a HTML document | |
6 | ||
7 | =head1 SYNOPSIS | |
8 | ||
9 | require HTML::HeadParser; | |
10 | $p = HTML::HeadParser->new; | |
11 | $p->parse($text) and print "not finished"; | |
12 | ||
13 | $p->header('Title') # to access <title>....</title> | |
14 | $p->header('Content-Base') # to access <base href="http://..."> | |
15 | $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> | |
16 | ||
17 | =head1 DESCRIPTION | |
18 | ||
19 | The I<HTML::HeadParser> is a specialized (and lightweight) | |
20 | I<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> | |
21 | section of an HTML document. The parse() method | |
22 | will return a FALSE value as soon as some E<lt>BODY> element or body | |
23 | text are found, and should not be called again after this. | |
24 | ||
25 | The I<HTML::HeadParser> keeps a reference to a header object, and the | |
26 | parser will update this header object as the various elements of the | |
27 | E<lt>HEAD> section of the HTML document are recognized. The following | |
28 | header fields are affected: | |
29 | ||
30 | =over 4 | |
31 | ||
32 | =item Content-Base: | |
33 | ||
34 | The I<Content-Base> header is initialized from the E<lt>base | |
35 | href="..."> element. | |
36 | ||
37 | =item Title: | |
38 | ||
39 | The I<Title> header is initialized from the E<lt>title>...E<lt>/title> | |
40 | element. | |
41 | ||
42 | =item Isindex: | |
43 | ||
44 | The I<Isindex> header will be added if there is a E<lt>isindex> | |
45 | element in the E<lt>head>. The header value is initialized from the | |
46 | I<prompt> attribute if it is present. If no I<prompt> attribute is | |
47 | given it will have '?' as the value. | |
48 | ||
49 | =item X-Meta-Foo: | |
50 | ||
51 | All E<lt>meta> elements will initialize headers with the prefix | |
52 | "C<X-Meta->" on the name. If the E<lt>meta> element contains a | |
53 | C<http-equiv> attribute, then it will be honored as the header name. | |
54 | ||
55 | =back | |
56 | ||
57 | =head1 METHODS | |
58 | ||
59 | The following methods (in addition to those provided by the | |
60 | superclass) are available: | |
61 | ||
62 | =over 4 | |
63 | ||
64 | =cut | |
65 | ||
66 | ||
67 | require HTML::Parser; | |
68 | @ISA = qw(HTML::Parser); | |
69 | ||
70 | use HTML::Entities (); | |
71 | ||
72 | use strict; | |
73 | use vars qw($VERSION $DEBUG); | |
74 | #$DEBUG = 1; | |
75 | $VERSION = sprintf("%d.%02d", q$Revision: 2.17 $ =~ /(\d+)\.(\d+)/); | |
76 | ||
77 | =item $hp = HTML::HeadParser->new( [$header] ) | |
78 | ||
79 | The object constructor. The optional $header argument should be a | |
80 | reference to an object that implement the header() and push_header() | |
81 | methods as defined by the I<HTTP::Headers> class. Normally it will be | |
82 | of some class that isa or delegates to the I<HTTP::Headers> class. | |
83 | ||
84 | If no $header is given I<HTML::HeadParser> will create an | |
85 | I<HTTP::Header> object by itself (initially empty). | |
86 | ||
87 | =cut | |
88 | ||
89 | sub new | |
90 | { | |
91 | my($class, $header) = @_; | |
92 | unless ($header) { | |
93 | require HTTP::Headers; | |
94 | $header = HTTP::Headers->new; | |
95 | } | |
96 | ||
97 | my $self = $class->SUPER::new(api_version => 2, | |
98 | ignore_elements => [qw(script style)], | |
99 | ); | |
100 | $self->{'header'} = $header; | |
101 | $self->{'tag'} = ''; # name of active element that takes textual content | |
102 | $self->{'text'} = ''; # the accumulated text associated with the element | |
103 | $self; | |
104 | } | |
105 | ||
106 | =item $hp->header; | |
107 | ||
108 | Returns a reference to the header object. | |
109 | ||
110 | =item $hp->header( $key ) | |
111 | ||
112 | Returns a header value. It is just a shorter way to write | |
113 | C<$hp-E<gt>header-E<gt>header($key)>. | |
114 | ||
115 | =cut | |
116 | ||
117 | sub header | |
118 | { | |
119 | my $self = shift; | |
120 | return $self->{'header'} unless @_; | |
121 | $self->{'header'}->header(@_); | |
122 | } | |
123 | ||
124 | sub as_string # legacy | |
125 | { | |
126 | my $self = shift; | |
127 | $self->{'header'}->as_string; | |
128 | } | |
129 | ||
130 | sub flush_text # internal | |
131 | { | |
132 | my $self = shift; | |
133 | my $tag = $self->{'tag'}; | |
134 | my $text = $self->{'text'}; | |
135 | $text =~ s/^\s+//; | |
136 | $text =~ s/\s+$//; | |
137 | $text =~ s/\s+/ /g; | |
138 | print "FLUSH $tag => '$text'\n" if $DEBUG; | |
139 | if ($tag eq 'title') { | |
140 | HTML::Entities::decode($text); | |
141 | $self->{'header'}->header(Title => $text); | |
142 | } | |
143 | $self->{'tag'} = $self->{'text'} = ''; | |
144 | } | |
145 | ||
146 | # This is an quote from the HTML3.2 DTD which shows which elements | |
147 | # that might be present in a <HEAD>...</HEAD>. Also note that the | |
148 | # <HEAD> tags themselves might be missing: | |
149 | # | |
150 | # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & | |
151 | # SCRIPT* & META* & LINK*"> | |
152 | # | |
153 | # <!ELEMENT HEAD O O (%head.content)> | |
154 | ||
155 | ||
156 | sub start | |
157 | { | |
158 | my($self, $tag, $attr) = @_; # $attr is reference to a HASH | |
159 | print "START[$tag]\n" if $DEBUG; | |
160 | $self->flush_text if $self->{'tag'}; | |
161 | if ($tag eq 'meta') { | |
162 | my $key = $attr->{'http-equiv'}; | |
163 | if (!defined($key) || !length($key)) { | |
164 | return unless $attr->{'name'}; | |
165 | $key = "X-Meta-\u$attr->{'name'}"; | |
166 | } | |
167 | $self->{'header'}->push_header($key => $attr->{content}); | |
168 | } elsif ($tag eq 'base') { | |
169 | return unless exists $attr->{href}; | |
170 | $self->{'header'}->header('Content-Base' => $attr->{href}); | |
171 | } elsif ($tag eq 'isindex') { | |
172 | # This is a non-standard header. Perhaps we should just ignore | |
173 | # this element | |
174 | $self->{'header'}->header(Isindex => $attr->{prompt} || '?'); | |
175 | } elsif ($tag =~ /^(?:title|script|style)$/) { | |
176 | # Just remember tag. Initialize header when we see the end tag. | |
177 | $self->{'tag'} = $tag; | |
178 | } elsif ($tag eq 'link') { | |
179 | return unless exists $attr->{href}; | |
180 | # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> | |
181 | my $h_val = "<" . delete($attr->{href}) . ">"; | |
182 | for (sort keys %{$attr}) { | |
183 | $h_val .= qq(; $_="$attr->{$_}"); | |
184 | } | |
185 | $self->{'header'}->push_header(Link => $h_val); | |
186 | } elsif ($tag eq 'head' || $tag eq 'html') { | |
187 | # ignore | |
188 | } else { | |
189 | # stop parsing | |
190 | $self->eof; | |
191 | } | |
192 | } | |
193 | ||
194 | sub end | |
195 | { | |
196 | my($self, $tag) = @_; | |
197 | print "END[$tag]\n" if $DEBUG; | |
198 | $self->flush_text if $self->{'tag'}; | |
199 | $self->eof if $tag eq 'head'; | |
200 | } | |
201 | ||
202 | sub text | |
203 | { | |
204 | my($self, $text) = @_; | |
205 | print "TEXT[$text]\n" if $DEBUG; | |
206 | my $tag = $self->{tag}; | |
207 | if (!$tag && $text =~ /\S/) { | |
208 | # Normal text means start of body | |
209 | $self->eof; | |
210 | return; | |
211 | } | |
212 | return if $tag ne 'title'; | |
213 | $self->{'text'} .= $text; | |
214 | } | |
215 | ||
216 | 1; | |
217 | ||
218 | __END__ | |
219 | ||
220 | =head1 EXAMPLE | |
221 | ||
222 | $h = HTTP::Headers->new; | |
223 | $p = HTML::HeadParser->new($h); | |
224 | $p->parse(<<EOT); | |
225 | <title>Stupid example</title> | |
226 | <base href="http://www.linpro.no/lwp/"> | |
227 | Normal text starts here. | |
228 | EOT | |
229 | undef $p; | |
230 | print $h->title; # should print "Stupid example" | |
231 | ||
232 | =head1 SEE ALSO | |
233 | ||
234 | L<HTML::Parser>, L<HTTP::Headers> | |
235 | ||
236 | The I<HTTP::Headers> class is distributed as part of the I<libwww-perl> | |
237 | package. | |
238 | ||
239 | =head1 COPYRIGHT | |
240 | ||
241 | Copyright 1996-2001 Gisle Aas. All rights reserved. | |
242 | ||
243 | This library is free software; you can redistribute it and/or | |
244 | modify it under the same terms as Perl itself. | |
245 | ||
246 | =cut | |
247 |