Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / sun4-solaris / HTML / TokeParser.pm
CommitLineData
86530b38
AT
1package HTML::TokeParser;
2
3# $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $
4
5require HTML::PullParser;
6@ISA=qw(HTML::PullParser);
7$VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/);
8
9use strict;
10use Carp ();
11use HTML::Entities qw(decode_entities);
12
# Argspecs that new() passes on to the parent constructor, keyed by
# parser event name.  Every spec begins with a literal type code
# ('S', 'E', 'T', 'PI', 'C' or 'D'), so the first element of each token
# tells get_tag() and get_text() what kind of token they are looking at.
my %ARGS = (
    start       => "'S',tagname,attr,attrseq,text",
    end         => "'E',tagname,text",
    text        => "'T',text,is_cdata",
    process     => "'PI',token0,text",
    comment     => "'C',text",
    declaration => "'D',text",
);
22
23
# Constructor.
#
# Call forms:
#   HTML::TokeParser->new($source)
#   HTML::TokeParser->new($source, key => value, ...)
#   HTML::TokeParser->new(key => value, ...)
#
# With an odd number of arguments the first one is the source: a
# reference to a plain scalar is passed to the parent class as "doc",
# anything else as "file".  The remaining arguments are option pairs.
#
# The "textify" option is consumed here and stored on the object; when
# it is missing or false the default {img => "alt", applet => "alt"} is
# used.  Everything else is handed to the parent constructor.
#
# Returns the new object, or undef when the parent constructor returns
# a false value.
sub new
{
    my $class = shift;
    my %cnf;

    if (@_ % 2) {
        # Odd count: a leading source argument, optionally followed by
        # option pairs.  Assigning such a list straight to a hash would
        # pair up keys and values wrongly.
        my $source = shift;
        my $type = (ref($source) eq "SCALAR") ? "doc" : "file";
        %cnf = ($type => $source, @_);
    }
    else {
        %cnf = @_;
    }

    my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};

    # %ARGS is listed last; presumably that lets the fixed token layout
    # win over any same-named caller option -- confirm against the
    # parent constructor.
    my $self = $class->SUPER::new(%cnf, %ARGS) or return undef;

    $self->{textify} = $textify;
    return $self;
}
43
44
# Return the next start or end tag, discarding every other token on the
# way.
#
# Arguments (optional): tag names to look for.  End tags are written
# with a leading "/", e.g. "/font".  Without arguments the first tag
# found is returned.
#
# The returned array ref is the token with its type code removed, so
# the tag name comes first; for end tags the name is prefixed with "/".
# Returns undef once get_token() yields a false value.
sub get_tag
{
    my ($self, @wanted) = @_;

    while (my $token = $self->get_token) {
        # Strip the type code; the tag name moves to index 0.
        my $type = shift @$token;
        next if $type ne "S" && $type ne "E";

        $token->[0] = "/" . $token->[0] if $type eq "E";
        my $name = $token->[0];

        return $token if !@wanted;
        foreach my $candidate (@wanted) {
            return $token if $name eq $candidate;
        }
    }

    return undef;
}
60
61
# Collect text from the current position and return it as one string.
#
# Arguments (optional): one or more tag names to stop at, end tags
# written with a leading "/" (e.g. "/a").  Without arguments -- or when
# every argument is undef -- collection stops at the first start or end
# tag that is not turned into text.  The tag that stops collection is
# pushed back with unget_token(), so the next get_token() returns it.
#
# Text tokens are entity-decoded unless their third element (is_cdata in
# the argspec) is true.  Comments, declarations and processing
# instructions are skipped.
#
# A start tag whose name is a key of $self->{textify} is replaced by
# text and never stops collection:
#   - code ref value: called with the whole token, including the leading
#     "S" type code; its return value is the text (undef adds nothing);
#   - otherwise the value names the attribute to take the text from
#     ("alt" when the value is false); if the tag lacks that attribute
#     the upper-cased tag name in brackets is used, e.g. "[IMG]".
#
# Returns the concatenated text, which may be the empty string.
sub get_text
{
    my $self = shift;

    # Undefined entries are dropped so that get_text(undef) still means
    # "stop at any tag", as it did when only one end tag was accepted.
    my @endat = grep { defined } @_;
    my @text;

    while (my $token = $self->get_token) {
        my $type = $token->[0];

        if ($type eq "T") {
            my $text = $token->[1];
            decode_entities($text) unless $token->[2];
            push(@text, $text);
            next;
        }
        next unless $type eq "S" || $type eq "E";

        my $tag = $token->[1];
        if ($type eq "E") {
            $tag = "/$tag";
        }
        elsif (exists $self->{textify}{$tag}) {
            my $alt = $self->{textify}{$tag};
            my $text;
            if (ref($alt)) {
                $text = $alt->(@$token);
            }
            else {
                $text = $token->[2]{$alt || "alt"};
                $text = "[\U$tag]" unless defined $text;
            }
            # A callback may return undef; skipping it gives the same
            # result string without an "uninitialized" warning in join.
            push(@text, $text) if defined $text;
            next;
        }

        if (!@endat || grep { $_ eq $tag } @endat) {
            $self->unget_token($token);
            last;
        }
    }

    return join("", @text);
}
99
100
# Same as get_text(), called with the same arguments, but with the
# white space normalised: every run of white space becomes a single
# blank, and leading and trailing white space is removed.
sub get_trimmed_text
{
    my $self = shift;

    # Collapse first; after that at most one blank can remain at each end.
    (my $text = $self->get_text(@_)) =~ s/\s+/ /g;
    $text =~ s/\A //;
    $text =~ s/ \z//;

    return $text;
}
108
1091;
110
111
112__END__
113
114=head1 NAME
115
116HTML::TokeParser - Alternative HTML::Parser interface
117
118=head1 SYNOPSIS
119
120 require HTML::TokeParser;
121 $p = HTML::TokeParser->new("index.html") || die "Can't open: $!";
122 while (my $token = $p->get_token) {
123 #...
124 }
125
126=head1 DESCRIPTION
127
128The C<HTML::TokeParser> is an alternative interface to the
129C<HTML::Parser> class. It is an C<HTML::PullParser> subclass.
130
131The following methods are available:
132
133=over 4
134
135=item $p = HTML::TokeParser->new( $file_or_doc );
136
137The object constructor argument is either a file name, a file handle
138object, or the complete document to be parsed.
139
140If the argument is a plain scalar, then it is taken as the name of a
141file to be opened and parsed. If the file can't be opened for
142reading, then the constructor will return an undefined value and $!
143will tell you why it failed.
144
145If the argument is a reference to a plain scalar, then this scalar is
146taken to be the literal document to parse. The value of this
147scalar should not be changed before all tokens have been extracted.
148
149Otherwise the argument is taken to be some object that the
150C<HTML::TokeParser> can read() from when it needs more data. Typically
151it will be a filehandle of some kind. The stream will be read() until
152EOF, but not closed.
153
154=item $p->get_token
155
156This method will return the next I<token> found in the HTML document,
157or C<undef> at the end of the document. The token is returned as an
158array reference. The first element of the array will be a (mostly)
159single character string denoting the type of this token: "S" for start
160tag, "E" for end tag, "T" for text, "C" for comment, "D" for
161declaration, and "PI" for processing instructions. The rest of the array
162is the same as the arguments passed to the corresponding HTML::Parser
163v2 compatible callbacks (see L<HTML::Parser>). In summary, returned
164tokens look like this:
165
166 ["S", $tag, $attr, $attrseq, $text]
167 ["E", $tag, $text]
168 ["T", $text, $is_data]
169 ["C", $text]
170 ["D", $text]
171 ["PI", $token0, $text]
172
173where $attr is a hash reference, $attrseq is an array reference and
174the rest is plain scalars.
175
176=item $p->unget_token($token,...)
177
178If you find out you have read too many tokens you can push them back,
179so that they are returned the next time $p->get_token is called.
180
181=item $p->get_tag( [$tag, ...] )
182
183This method returns the next start or end tag (skipping any other
184tokens), or C<undef> if there are no more tags in the document. If
185one or more arguments are given, then we skip tokens until one of the
186specified tag types is found. For example:
187
188 $p->get_tag("font", "/font");
189
190will find the next start or end tag for a font-element.
191
192The tag information is returned as an array reference in the same form
193as for $p->get_token above, but the type code (first element) is
194missing. A start tag will be returned like this:
195
196 [$tag, $attr, $attrseq, $text]
197
198The tag name of an end tag is prefixed with "/", i.e. an end tag is
199returned like this:
200
201 ["/$tag", $text]
202
203=item $p->get_text( [$endtag] )
204
205This method returns all text found at the current position. It will
206return a zero length string if the next token is not text. The
207optional $endtag argument specifies that any text occurring before the
208given tag is to be returned. Any entities will be converted to their
209corresponding character.
210
211The $p->{textify} attribute is a hash that defines how certain tags can
212be treated as text. If the name of a start tag matches a key in this
213hash then this tag is converted to text. The hash value is used to
214specify which tag attribute to obtain the text from. If this tag
215attribute is missing, then the upper case name of the tag enclosed in
216brackets is returned, e.g. "[IMG]". The hash value can also be a
217subroutine reference. In this case the routine is called with the
218start tag token content as its argument and the return value is treated
219as the text.
220
221The default $p->{textify} value is:
222
223 {img => "alt", applet => "alt"}
224
225This means that <IMG> and <APPLET> tags are treated as text, and that
226the text to substitute can be found in the ALT attribute.
227
228=item $p->get_trimmed_text( [$endtag] )
229
230Same as $p->get_text above, but will collapse any sequences of white
231space to a single space character. Leading and trailing white space is
232removed.
233
234=back
235
236=head1 EXAMPLES
237
238This example extracts all links from a document. It will print one
239line for each link, containing the URL and the textual description
240between the <A>...</A> tags:
241
242 use HTML::TokeParser;
243 $p = HTML::TokeParser->new(shift||"index.html");
244
245 while (my $token = $p->get_tag("a")) {
246 my $url = $token->[1]{href} || "-";
247 my $text = $p->get_trimmed_text("/a");
248 print "$url\t$text\n";
249 }
250
251This example extracts the <TITLE> from the document:
252
253 use HTML::TokeParser;
254 $p = HTML::TokeParser->new(shift||"index.html");
255 if ($p->get_tag("title")) {
256 my $title = $p->get_trimmed_text;
257 print "Title: $title\n";
258 }
259
260=head1 SEE ALSO
261
262L<HTML::PullParser>, L<HTML::Parser>
263
264=head1 COPYRIGHT
265
266Copyright 1998-2001 Gisle Aas.
267
268This library is free software; you can redistribute it and/or
269modify it under the same terms as Perl itself.
270
271=cut