Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package HTML::TokeParser; |
2 | ||
3 | # $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $ | |
4 | ||
5 | require HTML::PullParser; | |
6 | @ISA=qw(HTML::PullParser); | |
7 | $VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/); | |
8 | ||
9 | use strict; | |
10 | use Carp (); | |
11 | use HTML::Entities qw(decode_entities); | |
12 | ||
13 | my %ARGS = | |
14 | ( | |
15 | start => "'S',tagname,attr,attrseq,text", | |
16 | end => "'E',tagname,text", | |
17 | text => "'T',text,is_cdata", | |
18 | process => "'PI',token0,text", | |
19 | comment => "'C',text", | |
20 | declaration => "'D',text", | |
21 | ); | |
22 | ||
23 | ||
24 | sub new | |
25 | { | |
26 | my $class = shift; | |
27 | my %cnf; | |
28 | if (@_ == 1) { | |
29 | my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; | |
30 | %cnf = ($type => $_[0]); | |
31 | } | |
32 | else { | |
33 | %cnf = @_; | |
34 | } | |
35 | ||
36 | my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; | |
37 | ||
38 | my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; | |
39 | ||
40 | $self->{textify} = $textify; | |
41 | $self; | |
42 | } | |
43 | ||
44 | ||
45 | sub get_tag | |
46 | { | |
47 | my $self = shift; | |
48 | my $token; | |
49 | while (1) { | |
50 | $token = $self->get_token || return undef; | |
51 | my $type = shift @$token; | |
52 | next unless $type eq "S" || $type eq "E"; | |
53 | substr($token->[0], 0, 0) = "/" if $type eq "E"; | |
54 | return $token unless @_; | |
55 | for (@_) { | |
56 | return $token if $token->[0] eq $_; | |
57 | } | |
58 | } | |
59 | } | |
60 | ||
61 | ||
62 | sub get_text | |
63 | { | |
64 | my $self = shift; | |
65 | my $endat = shift; | |
66 | my @text; | |
67 | while (my $token = $self->get_token) { | |
68 | my $type = $token->[0]; | |
69 | if ($type eq "T") { | |
70 | my $text = $token->[1]; | |
71 | decode_entities($text) unless $token->[2]; | |
72 | push(@text, $text); | |
73 | } elsif ($type =~ /^[SE]$/) { | |
74 | my $tag = $token->[1]; | |
75 | if ($type eq "S") { | |
76 | if (exists $self->{textify}{$tag}) { | |
77 | my $alt = $self->{textify}{$tag}; | |
78 | my $text; | |
79 | if (ref($alt)) { | |
80 | $text = &$alt(@$token); | |
81 | } else { | |
82 | $text = $token->[2]{$alt || "alt"}; | |
83 | $text = "[\U$tag]" unless defined $text; | |
84 | } | |
85 | push(@text, $text); | |
86 | next; | |
87 | } | |
88 | } else { | |
89 | $tag = "/$tag"; | |
90 | } | |
91 | if (!defined($endat) || $endat eq $tag) { | |
92 | $self->unget_token($token); | |
93 | last; | |
94 | } | |
95 | } | |
96 | } | |
97 | join("", @text); | |
98 | } | |
99 | ||
100 | ||
101 | sub get_trimmed_text | |
102 | { | |
103 | my $self = shift; | |
104 | my $text = $self->get_text(@_); | |
105 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; | |
106 | $text; | |
107 | } | |
108 | ||
109 | 1; | |
110 | ||
111 | ||
112 | __END__ | |
113 | ||
114 | =head1 NAME | |
115 | ||
116 | HTML::TokeParser - Alternative HTML::Parser interface | |
117 | ||
118 | =head1 SYNOPSIS | |
119 | ||
120 | require HTML::TokeParser; | |
121 | $p = HTML::TokeParser->new("index.html") || die "Can't open: $!"; | |
122 | while (my $token = $p->get_token) { | |
123 | #... | |
124 | } | |
125 | ||
126 | =head1 DESCRIPTION | |
127 | ||
128 | The C<HTML::TokeParser> is an alternative interface to the | |
129 | C<HTML::Parser> class. It is an C<HTML::PullParser> subclass. | |
130 | ||
131 | The following methods are available: | |
132 | ||
133 | =over 4 | |
134 | ||
135 | =item $p = HTML::TokeParser->new( $file_or_doc ); | |
136 | ||
137 | The object constructor argument is either a file name, a file handle | |
138 | object, or the complete document to be parsed. | |
139 | ||
140 | If the argument is a plain scalar, then it is taken as the name of a | |
141 | file to be opened and parsed. If the file can't be opened for | |
142 | reading, then the constructor will return an undefined value and $! | |
143 | will tell you why it failed. | |
144 | ||
145 | If the argument is a reference to a plain scalar, then this scalar is | |
146 | taken to be the literal document to parse. The value of this | |
147 | scalar should not be changed before all tokens have been extracted. | |
148 | ||
149 | Otherwise the argument is taken to be some object that the | |
150 | C<HTML::TokeParser> can read() from when it needs more data. Typically | |
151 | it will be a filehandle of some kind. The stream will be read() until | |
152 | EOF, but not closed. | |
153 | ||
154 | =item $p->get_token | |
155 | ||
156 | This method will return the next I<token> found in the HTML document, | |
157 | or C<undef> at the end of the document. The token is returned as an | |
158 | array reference. The first element of the array will be a (mostly) | |
159 | single character string denoting the type of this token: "S" for start | |
160 | tag, "E" for end tag, "T" for text, "C" for comment, "D" for | |
161 | declaration, and "PI" for process instructions. The rest of the array | |
162 | is the same as the arguments passed to the corresponding HTML::Parser | |
163 | v2 compatible callbacks (see L<HTML::Parser>). In summary, returned | |
164 | tokens look like this: | |
165 | ||
166 | ["S", $tag, $attr, $attrseq, $text] | |
167 | ["E", $tag, $text] | |
168 | ["T", $text, $is_data] | |
169 | ["C", $text] | |
170 | ["D", $text] | |
171 | ["PI", $token0, $text] | |
172 | ||
173 | where $attr is a hash reference, $attrseq is an array reference and | |
174 | the rest is plain scalars. | |
175 | ||
176 | =item $p->unget_token($token,...) | |
177 | ||
178 | If you find out you have read too many tokens you can push them back, | |
179 | so that they are returned the next time $p->get_token is called. | |
180 | ||
181 | =item $p->get_tag( [$tag, ...] ) | |
182 | ||
183 | This method returns the next start or end tag (skipping any other | |
184 | tokens), or C<undef> if there are no more tags in the document. If | |
185 | one or more arguments are given, then we skip tokens until one of the | |
186 | specified tag types is found. For example: | |
187 | ||
188 | $p->get_tag("font", "/font"); | |
189 | ||
190 | will find the next start or end tag for a font-element. | |
191 | ||
192 | The tag information is returned as an array reference in the same form | |
193 | as for $p->get_token above, but the type code (first element) is | |
194 | missing. A start tag will be returned like this: | |
195 | ||
196 | [$tag, $attr, $attrseq, $text] | |
197 | ||
198 | The tagname of end tags are prefixed with "/", i.e. end tag is | |
199 | returned like this: | |
200 | ||
201 | ["/$tag", $text] | |
202 | ||
203 | =item $p->get_text( [$endtag] ) | |
204 | ||
205 | This method returns all text found at the current position. It will | |
206 | return a zero length string if the next token is not text. The | |
207 | optional $endtag argument specifies that any text occurring before the | |
208 | given tag is to be returned. Any entities will be converted to their | |
209 | corresponding character. | |
210 | ||
211 | The $p->{textify} attribute is a hash that defines how certain tags can | |
212 | be treated as text. If the name of a start tag matches a key in this | |
213 | hash then this tag is converted to text. The hash value is used to | |
214 | specify which tag attribute to obtain the text from. If this tag | |
215 | attribute is missing, then the upper case name of the tag enclosed in | |
216 | brackets is returned, e.g. "[IMG]". The hash value can also be a | |
217 | subroutine reference. In this case the routine is called with the | |
218 | start tag token content as its argument and the return value is treated | |
219 | as the text. | |
220 | ||
221 | The default $p->{textify} value is: | |
222 | ||
223 | {img => "alt", applet => "alt"} | |
224 | ||
225 | This means that <IMG> and <APPLET> tags are treated as text, and that | |
226 | the text to substitute can be found in the ALT attribute. | |
227 | ||
228 | =item $p->get_trimmed_text( [$endtag] ) | |
229 | ||
230 | Same as $p->get_text above, but will collapse any sequences of white | |
231 | space to a single space character. Leading and trailing white space is | |
232 | removed. | |
233 | ||
234 | =back | |
235 | ||
236 | =head1 EXAMPLES | |
237 | ||
238 | This example extracts all links from a document. It will print one | |
239 | line for each link, containing the URL and the textual description | |
240 | between the <A>...</A> tags: | |
241 | ||
242 | use HTML::TokeParser; | |
243 | $p = HTML::TokeParser->new(shift||"index.html"); | |
244 | ||
245 | while (my $token = $p->get_tag("a")) { | |
246 | my $url = $token->[1]{href} || "-"; | |
247 | my $text = $p->get_trimmed_text("/a"); | |
248 | print "$url\t$text\n"; | |
249 | } | |
250 | ||
251 | This example extract the <TITLE> from the document: | |
252 | ||
253 | use HTML::TokeParser; | |
254 | $p = HTML::TokeParser->new(shift||"index.html"); | |
255 | if ($p->get_tag("title")) { | |
256 | my $title = $p->get_trimmed_text; | |
257 | print "Title: $title\n"; | |
258 | } | |
259 | ||
260 | =head1 SEE ALSO | |
261 | ||
262 | L<HTML::PullParser>, L<HTML::Parser> | |
263 | ||
264 | =head1 COPYRIGHT | |
265 | ||
266 | Copyright 1998-2001 Gisle Aas. | |
267 | ||
268 | This library is free software; you can redistribute it and/or | |
269 | modify it under the same terms as Perl itself. | |
270 | ||
271 | =cut |