| 1 | package HTML::TokeParser; |
| 2 | |
| 3 | # $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $ |
| 4 | |
| 5 | require HTML::PullParser; |
| 6 | @ISA=qw(HTML::PullParser); |
| 7 | $VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/); |
| 8 | |
| 9 | use strict; |
| 10 | use Carp (); |
| 11 | use HTML::Entities qw(decode_entities); |
| 12 | |
| 13 | my %ARGS = |
| 14 | ( |
| 15 | start => "'S',tagname,attr,attrseq,text", |
| 16 | end => "'E',tagname,text", |
| 17 | text => "'T',text,is_cdata", |
| 18 | process => "'PI',token0,text", |
| 19 | comment => "'C',text", |
| 20 | declaration => "'D',text", |
| 21 | ); |
| 22 | |
| 23 | |
| 24 | sub new |
| 25 | { |
| 26 | my $class = shift; |
| 27 | my %cnf; |
| 28 | if (@_ == 1) { |
| 29 | my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; |
| 30 | %cnf = ($type => $_[0]); |
| 31 | } |
| 32 | else { |
| 33 | %cnf = @_; |
| 34 | } |
| 35 | |
| 36 | my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; |
| 37 | |
| 38 | my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; |
| 39 | |
| 40 | $self->{textify} = $textify; |
| 41 | $self; |
| 42 | } |
| 43 | |
| 44 | |
| 45 | sub get_tag |
| 46 | { |
| 47 | my $self = shift; |
| 48 | my $token; |
| 49 | while (1) { |
| 50 | $token = $self->get_token || return undef; |
| 51 | my $type = shift @$token; |
| 52 | next unless $type eq "S" || $type eq "E"; |
| 53 | substr($token->[0], 0, 0) = "/" if $type eq "E"; |
| 54 | return $token unless @_; |
| 55 | for (@_) { |
| 56 | return $token if $token->[0] eq $_; |
| 57 | } |
| 58 | } |
| 59 | } |
| 60 | |
| 61 | |
| 62 | sub get_text |
| 63 | { |
| 64 | my $self = shift; |
| 65 | my $endat = shift; |
| 66 | my @text; |
| 67 | while (my $token = $self->get_token) { |
| 68 | my $type = $token->[0]; |
| 69 | if ($type eq "T") { |
| 70 | my $text = $token->[1]; |
| 71 | decode_entities($text) unless $token->[2]; |
| 72 | push(@text, $text); |
| 73 | } elsif ($type =~ /^[SE]$/) { |
| 74 | my $tag = $token->[1]; |
| 75 | if ($type eq "S") { |
| 76 | if (exists $self->{textify}{$tag}) { |
| 77 | my $alt = $self->{textify}{$tag}; |
| 78 | my $text; |
| 79 | if (ref($alt)) { |
| 80 | $text = &$alt(@$token); |
| 81 | } else { |
| 82 | $text = $token->[2]{$alt || "alt"}; |
| 83 | $text = "[\U$tag]" unless defined $text; |
| 84 | } |
| 85 | push(@text, $text); |
| 86 | next; |
| 87 | } |
| 88 | } else { |
| 89 | $tag = "/$tag"; |
| 90 | } |
| 91 | if (!defined($endat) || $endat eq $tag) { |
| 92 | $self->unget_token($token); |
| 93 | last; |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | join("", @text); |
| 98 | } |
| 99 | |
| 100 | |
| 101 | sub get_trimmed_text |
| 102 | { |
| 103 | my $self = shift; |
| 104 | my $text = $self->get_text(@_); |
| 105 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; |
| 106 | $text; |
| 107 | } |
| 108 | |
| 109 | 1; |
| 110 | |
| 111 | |
| 112 | __END__ |
| 113 | |
| 114 | =head1 NAME |
| 115 | |
| 116 | HTML::TokeParser - Alternative HTML::Parser interface |
| 117 | |
| 118 | =head1 SYNOPSIS |
| 119 | |
| 120 | require HTML::TokeParser; |
| 121 | $p = HTML::TokeParser->new("index.html") || die "Can't open: $!"; |
| 122 | while (my $token = $p->get_token) { |
| 123 | #... |
| 124 | } |
| 125 | |
| 126 | =head1 DESCRIPTION |
| 127 | |
| 128 | The C<HTML::TokeParser> is an alternative interface to the |
| 129 | C<HTML::Parser> class. It is an C<HTML::PullParser> subclass. |
| 130 | |
| 131 | The following methods are available: |
| 132 | |
| 133 | =over 4 |
| 134 | |
| 135 | =item $p = HTML::TokeParser->new( $file_or_doc ); |
| 136 | |
| 137 | The object constructor argument is either a file name, a file handle |
| 138 | object, or the complete document to be parsed. |
| 139 | |
| 140 | If the argument is a plain scalar, then it is taken as the name of a |
| 141 | file to be opened and parsed. If the file can't be opened for |
| 142 | reading, then the constructor will return an undefined value and $! |
| 143 | will tell you why it failed. |
| 144 | |
| 145 | If the argument is a reference to a plain scalar, then this scalar is |
| 146 | taken to be the literal document to parse. The value of this |
| 147 | scalar should not be changed before all tokens have been extracted. |
| 148 | |
| 149 | Otherwise the argument is taken to be some object that the |
| 150 | C<HTML::TokeParser> can read() from when it needs more data. Typically |
| 151 | it will be a filehandle of some kind. The stream will be read() until |
| 152 | EOF, but not closed. |
| 153 | |
| 154 | =item $p->get_token |
| 155 | |
| 156 | This method will return the next I<token> found in the HTML document, |
| 157 | or C<undef> at the end of the document. The token is returned as an |
| 158 | array reference. The first element of the array will be a (mostly) |
| 159 | single character string denoting the type of this token: "S" for start |
| 160 | tag, "E" for end tag, "T" for text, "C" for comment, "D" for |
| 161 | declaration, and "PI" for process instructions. The rest of the array |
| 162 | is the same as the arguments passed to the corresponding HTML::Parser |
| 163 | v2 compatible callbacks (see L<HTML::Parser>). In summary, returned |
| 164 | tokens look like this: |
| 165 | |
| 166 | ["S", $tag, $attr, $attrseq, $text] |
| 167 | ["E", $tag, $text] |
| 168 | ["T", $text, $is_data] |
| 169 | ["C", $text] |
| 170 | ["D", $text] |
| 171 | ["PI", $token0, $text] |
| 172 | |
| 173 | where $attr is a hash reference, $attrseq is an array reference and |
| 174 | the rest is plain scalars. |
| 175 | |
| 176 | =item $p->unget_token($token,...) |
| 177 | |
| 178 | If you find out you have read too many tokens you can push them back, |
| 179 | so that they are returned the next time $p->get_token is called. |
| 180 | |
| 181 | =item $p->get_tag( [$tag, ...] ) |
| 182 | |
| 183 | This method returns the next start or end tag (skipping any other |
| 184 | tokens), or C<undef> if there are no more tags in the document. If |
| 185 | one or more arguments are given, then we skip tokens until one of the |
| 186 | specified tag types is found. For example: |
| 187 | |
| 188 | $p->get_tag("font", "/font"); |
| 189 | |
| 190 | will find the next start or end tag for a font-element. |
| 191 | |
| 192 | The tag information is returned as an array reference in the same form |
| 193 | as for $p->get_token above, but the type code (first element) is |
| 194 | missing. A start tag will be returned like this: |
| 195 | |
| 196 | [$tag, $attr, $attrseq, $text] |
| 197 | |
| 198 | The tagname of end tags are prefixed with "/", i.e. end tag is |
| 199 | returned like this: |
| 200 | |
| 201 | ["/$tag", $text] |
| 202 | |
| 203 | =item $p->get_text( [$endtag] ) |
| 204 | |
| 205 | This method returns all text found at the current position. It will |
| 206 | return a zero length string if the next token is not text. The |
| 207 | optional $endtag argument specifies that any text occurring before the |
| 208 | given tag is to be returned. Any entities will be converted to their |
| 209 | corresponding character. |
| 210 | |
| 211 | The $p->{textify} attribute is a hash that defines how certain tags can |
| 212 | be treated as text. If the name of a start tag matches a key in this |
| 213 | hash then this tag is converted to text. The hash value is used to |
| 214 | specify which tag attribute to obtain the text from. If this tag |
| 215 | attribute is missing, then the upper case name of the tag enclosed in |
| 216 | brackets is returned, e.g. "[IMG]". The hash value can also be a |
| 217 | subroutine reference. In this case the routine is called with the |
| 218 | start tag token content as its argument and the return value is treated |
| 219 | as the text. |
| 220 | |
| 221 | The default $p->{textify} value is: |
| 222 | |
| 223 | {img => "alt", applet => "alt"} |
| 224 | |
| 225 | This means that <IMG> and <APPLET> tags are treated as text, and that |
| 226 | the text to substitute can be found in the ALT attribute. |
| 227 | |
| 228 | =item $p->get_trimmed_text( [$endtag] ) |
| 229 | |
| 230 | Same as $p->get_text above, but will collapse any sequences of white |
| 231 | space to a single space character. Leading and trailing white space is |
| 232 | removed. |
| 233 | |
| 234 | =back |
| 235 | |
| 236 | =head1 EXAMPLES |
| 237 | |
| 238 | This example extracts all links from a document. It will print one |
| 239 | line for each link, containing the URL and the textual description |
| 240 | between the <A>...</A> tags: |
| 241 | |
| 242 | use HTML::TokeParser; |
| 243 | $p = HTML::TokeParser->new(shift||"index.html"); |
| 244 | |
| 245 | while (my $token = $p->get_tag("a")) { |
| 246 | my $url = $token->[1]{href} || "-"; |
| 247 | my $text = $p->get_trimmed_text("/a"); |
| 248 | print "$url\t$text\n"; |
| 249 | } |
| 250 | |
| 251 | This example extract the <TITLE> from the document: |
| 252 | |
| 253 | use HTML::TokeParser; |
| 254 | $p = HTML::TokeParser->new(shift||"index.html"); |
| 255 | if ($p->get_tag("title")) { |
| 256 | my $title = $p->get_trimmed_text; |
| 257 | print "Title: $title\n"; |
| 258 | } |
| 259 | |
| 260 | =head1 SEE ALSO |
| 261 | |
| 262 | L<HTML::PullParser>, L<HTML::Parser> |
| 263 | |
| 264 | =head1 COPYRIGHT |
| 265 | |
| 266 | Copyright 1998-2001 Gisle Aas. |
| 267 | |
| 268 | This library is free software; you can redistribute it and/or |
| 269 | modify it under the same terms as Perl itself. |
| 270 | |
| 271 | =cut |