[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / sun4-solaris / HTML / TokeParser.pm

package HTML::TokeParser;

# $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $

require HTML::PullParser;
@ISA=qw(HTML::PullParser);
$VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/);

use strict;
use Carp ();
use HTML::Entities qw(decode_entities);

my %ARGS =
(
 start       => "'S',tagname,attr,attrseq,text",
 end         => "'E',tagname,text",
 text        => "'T',text,is_cdata",
 process     => "'PI',token0,text",
 comment     => "'C',text",
 declaration => "'D',text",
);


sub new
{
    my $class = shift;
    my %cnf;
    if (@_ == 1) {
	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
	%cnf = ($type => $_[0]);
    }
    else {
	%cnf = @_;
    }

    my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};

    my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;

    $self->{textify} = $textify;
    $self;
}


sub get_tag
{
    my $self = shift;
    my $token;
    while (1) {
	$token = $self->get_token || return undef;
	my $type = shift @$token;
	next unless $type eq "S" || $type eq "E";
	substr($token->[0], 0, 0) = "/" if $type eq "E";
	return $token unless @_;
	for (@_) {
	    return $token if $token->[0] eq $_;
	}
    }
}


sub get_text
{
    my $self = shift;
    my $endat = shift;
    my @text;
    while (my $token = $self->get_token) {
	my $type = $token->[0];
	if ($type eq "T") {
	    my $text = $token->[1];
	    decode_entities($text) unless $token->[2];
	    push(@text, $text);
	} elsif ($type =~ /^[SE]$/) {
	    my $tag = $token->[1];
	    if ($type eq "S") {
		if (exists $self->{textify}{$tag}) {
		    my $alt = $self->{textify}{$tag};
		    my $text;
		    if (ref($alt)) {
			$text = &$alt(@$token);
		    } else {
			$text = $token->[2]{$alt || "alt"};
			$text = "[\U$tag]" unless defined $text;
		    }
		    push(@text, $text);
		    next;
		}
	    } else {
		$tag = "/$tag";
	    }
	    if (!defined($endat) || $endat eq $tag) {
		 $self->unget_token($token);
		 last;
	    }
	}
    }
    join("", @text);
}


sub get_trimmed_text
{
    my $self = shift;
    my $text = $self->get_text(@_);
    $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
    $text;
}

1;


__END__

=head1 NAME

HTML::TokeParser - Alternative HTML::Parser interface

=head1 SYNOPSIS

 require HTML::TokeParser;
 $p = HTML::TokeParser->new("index.html") || die "Can't open: $!";
 while (my $token = $p->get_token) {
     #...
 }

=head1 DESCRIPTION

The C<HTML::TokeParser> is an alternative interface to the
C<HTML::Parser> class.  It is an C<HTML::PullParser> subclass.

The following methods are available:

=over 4

=item $p = HTML::TokeParser->new( $file_or_doc );

The object constructor argument is either a file name, a file handle
object, or the complete document to be parsed.

If the argument is a plain scalar, then it is taken as the name of a
file to be opened and parsed.  If the file can't be opened for
reading, then the constructor will return an undefined value and $!
will tell you why it failed.

If the argument is a reference to a plain scalar, then this scalar is
taken to be the literal document to parse.  The value of this
scalar should not be changed before all tokens have been extracted.

Otherwise the argument is taken to be some object that the
C<HTML::TokeParser> can read() from when it needs more data.  Typically
it will be a filehandle of some kind.  The stream will be read() until
EOF, but not closed.

=item $p->get_token

This method will return the next I<token> found in the HTML document,
or C<undef> at the end of the document.  The token is returned as an
array reference.  The first element of the array will be a (mostly)
single character string denoting the type of this token: "S" for start
tag, "E" for end tag, "T" for text, "C" for comment, "D" for
declaration, and "PI" for process instructions.  The rest of the array
is the same as the arguments passed to the corresponding HTML::Parser
v2 compatible callbacks (see L<HTML::Parser>).  In summary, returned
tokens look like this:

  ["S",  $tag, $attr, $attrseq, $text]
  ["E",  $tag, $text]
  ["T",  $text, $is_data]
  ["C",  $text]
  ["D",  $text]
  ["PI", $token0, $text]

where $attr is a hash reference, $attrseq is an array reference and
the rest is plain scalars.

=item $p->unget_token($token,...)

If you find out you have read too many tokens you can push them back,
so that they are returned the next time $p->get_token is called.

=item $p->get_tag( [$tag, ...] )

This method returns the next start or end tag (skipping any other
tokens), or C<undef> if there are no more tags in the document.  If
one or more arguments are given, then we skip tokens until one of the
specified tag types is found.  For example:

   $p->get_tag("font", "/font");

will find the next start or end tag for a font-element.

The tag information is returned as an array reference in the same form
as for $p->get_token above, but the type code (first element) is
missing. A start tag will be returned like this:

  [$tag, $attr, $attrseq, $text]

The tagname of end tags are prefixed with "/", i.e. end tag is
returned like this:

  ["/$tag", $text]

=item $p->get_text( [$endtag] )

This method returns all text found at the current position. It will
return a zero length string if the next token is not text.  The
optional $endtag argument specifies that any text occurring before the
given tag is to be returned.  Any entities will be converted to their
corresponding character.

The $p->{textify} attribute is a hash that defines how certain tags can
be treated as text.  If the name of a start tag matches a key in this
hash then this tag is converted to text.  The hash value is used to
specify which tag attribute to obtain the text from.  If this tag
attribute is missing, then the upper case name of the tag enclosed in
brackets is returned, e.g. "[IMG]".  The hash value can also be a
subroutine reference.  In this case the routine is called with the
start tag token content as its argument and the return value is treated
as the text.

The default $p->{textify} value is:

  {img => "alt", applet => "alt"}

This means that <IMG> and <APPLET> tags are treated as text, and that
the text to substitute can be found in the ALT attribute.

=item $p->get_trimmed_text( [$endtag] )

Same as $p->get_text above, but will collapse any sequences of white
space to a single space character.  Leading and trailing white space is
removed.

=back

=head1 EXAMPLES

This example extracts all links from a document.  It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:

  use HTML::TokeParser;
  $p = HTML::TokeParser->new(shift||"index.html");

  while (my $token = $p->get_tag("a")) {
      my $url = $token->[1]{href} || "-";
      my $text = $p->get_trimmed_text("/a");
      print "$url\t$text\n";
  }

This example extract the <TITLE> from the document:

  use HTML::TokeParser;
  $p = HTML::TokeParser->new(shift||"index.html");
  if ($p->get_tag("title")) {
      my $title = $p->get_trimmed_text;
      print "Title: $title\n";
  }

=head1 SEE ALSO

L<HTML::PullParser>, L<HTML::Parser>

=head1 COPYRIGHT

Copyright 1998-2001 Gisle Aas.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut
Commit	Line	Data
86530b38 AT	1	package HTML::TokeParser;
	2
	3	# $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $
	4
	5	require HTML::PullParser;
	6	@ISA=qw(HTML::PullParser);
	7	$VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/);
	8
	9	use strict;
	10	use Carp ();
	11	use HTML::Entities qw(decode_entities);
	12
	13	my %ARGS =
	14	(
	15	start => "'S',tagname,attr,attrseq,text",
	16	end => "'E',tagname,text",
	17	text => "'T',text,is_cdata",
	18	process => "'PI',token0,text",
	19	comment => "'C',text",
	20	declaration => "'D',text",
	21	);
	22
	23
	24	sub new
	25	{
	26	my $class = shift;
	27	my %cnf;
	28	if (@_ == 1) {
	29	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
	30	%cnf = ($type => $_[0]);
	31	}
	32	else {
	33	%cnf = @_;
	34	}
	35
	36	my $textify = delete $cnf{textify} \|\| {img => "alt", applet => "alt"};
	37
	38	my $self = $class->SUPER::new(%cnf, %ARGS) \|\| return undef;
	39
	40	$self->{textify} = $textify;
	41	$self;
	42	}
	43
	44
	45	sub get_tag
	46	{
	47	my $self = shift;
	48	my $token;
	49	while (1) {
	50	$token = $self->get_token \|\| return undef;
	51	my $type = shift @$token;
	52	next unless $type eq "S" \|\| $type eq "E";
	53	substr($token->[0], 0, 0) = "/" if $type eq "E";
	54	return $token unless @_;
	55	for (@_) {
	56	return $token if $token->[0] eq $_;
	57	}
	58	}
	59	}
	60
	61
	62	sub get_text
	63	{
	64	my $self = shift;
65	my $endat = shift;
66	my @text;
67	while (my $token = $self->get_token) {
68	my $type = $token->[0];
69	if ($type eq "T") {
70	my $text = $token->[1];
71	decode_entities($text) unless $token->[2];
72	push(@text, $text);
73	} elsif ($type =~ /^[SE]$/) {
74	my $tag = $token->[1];
75	if ($type eq "S") {
76	if (exists $self->{textify}{$tag}) {
77	my $alt = $self->{textify}{$tag};
78	my $text;
79	if (ref($alt)) {
80	$text = &$alt(@$token);
81	} else {
82	$text = $token->[2]{$alt \|\| "alt"};
83	$text = "[\U$tag]" unless defined $text;
84	}
85	push(@text, $text);
86	next;
87	}
88	} else {
89	$tag = "/$tag";
90	}
91	if (!defined($endat) \|\| $endat eq $tag) {
92	$self->unget_token($token);
93	last;
94	}
95	}
96	}
97	join("", @text);
98	}
99
100
101	sub get_trimmed_text
102	{
103	my $self = shift;
104	my $text = $self->get_text(@_);
105	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
106	$text;
107	}
108
109	1;
110
111
112	__END__
113
114	=head1 NAME
115
116	HTML::TokeParser - Alternative HTML::Parser interface
117
118	=head1 SYNOPSIS
119
120	require HTML::TokeParser;
121	$p = HTML::TokeParser->new("index.html") \|\| die "Can't open: $!";
122	while (my $token = $p->get_token) {
123	#...
124	}
125
126	=head1 DESCRIPTION
127
128	The C<HTML::TokeParser> is an alternative interface to the
129	C<HTML::Parser> class. It is an C<HTML::PullParser> subclass.
130
131	The following methods are available:
132
133	=over 4
134
135	=item $p = HTML::TokeParser->new( $file_or_doc );
136
137	The object constructor argument is either a file name, a file handle
138	object, or the complete document to be parsed.
139
140	If the argument is a plain scalar, then it is taken as the name of a
141	file to be opened and parsed. If the file can't be opened for
142	reading, then the constructor will return an undefined value and $!
143	will tell you why it failed.
144
145	If the argument is a reference to a plain scalar, then this scalar is
146	taken to be the literal document to parse. The value of this
147	scalar should not be changed before all tokens have been extracted.
148
149	Otherwise the argument is taken to be some object that the
150	C<HTML::TokeParser> can read() from when it needs more data. Typically
151	it will be a filehandle of some kind. The stream will be read() until
152	EOF, but not closed.
153
154	=item $p->get_token
155
156	This method will return the next I<token> found in the HTML document,
157	or C<undef> at the end of the document. The token is returned as an
158	array reference. The first element of the array will be a (mostly)
159	single character string denoting the type of this token: "S" for start
160	tag, "E" for end tag, "T" for text, "C" for comment, "D" for
161	declaration, and "PI" for process instructions. The rest of the array
162	is the same as the arguments passed to the corresponding HTML::Parser
163	v2 compatible callbacks (see L<HTML::Parser>). In summary, returned
164	tokens look like this:
165
166	["S", $tag, $attr, $attrseq, $text]
167	["E", $tag, $text]
168	["T", $text, $is_data]
169	["C", $text]
170	["D", $text]
171	["PI", $token0, $text]
172
173	where $attr is a hash reference, $attrseq is an array reference and
174	the rest is plain scalars.
175
176	=item $p->unget_token($token,...)
177
178	If you find out you have read too many tokens you can push them back,
179	so that they are returned the next time $p->get_token is called.
180
181	=item $p->get_tag( [$tag, ...] )
182
183	This method returns the next start or end tag (skipping any other
184	tokens), or C<undef> if there are no more tags in the document. If
185	one or more arguments are given, then we skip tokens until one of the
186	specified tag types is found. For example:
187
188	$p->get_tag("font", "/font");
189
190	will find the next start or end tag for a font-element.
191
192	The tag information is returned as an array reference in the same form
193	as for $p->get_token above, but the type code (first element) is
194	missing. A start tag will be returned like this:
195
196	[$tag, $attr, $attrseq, $text]
197
198	The tagname of end tags are prefixed with "/", i.e. end tag is
199	returned like this:
200
201	["/$tag", $text]
202
203	=item $p->get_text( [$endtag] )
204
205	This method returns all text found at the current position. It will
206	return a zero length string if the next token is not text. The
207	optional $endtag argument specifies that any text occurring before the
208	given tag is to be returned. Any entities will be converted to their
209	corresponding character.
210
211	The $p->{textify} attribute is a hash that defines how certain tags can
212	be treated as text. If the name of a start tag matches a key in this
213	hash then this tag is converted to text. The hash value is used to
214	specify which tag attribute to obtain the text from. If this tag
215	attribute is missing, then the upper case name of the tag enclosed in
216	brackets is returned, e.g. "[IMG]". The hash value can also be a
217	subroutine reference. In this case the routine is called with the
218	start tag token content as its argument and the return value is treated
219	as the text.
220
221	The default $p->{textify} value is:
222
223	{img => "alt", applet => "alt"}
224
225	This means that <IMG> and <APPLET> tags are treated as text, and that
226	the text to substitute can be found in the ALT attribute.
227
228	=item $p->get_trimmed_text( [$endtag] )
229
230	Same as $p->get_text above, but will collapse any sequences of white
231	space to a single space character. Leading and trailing white space is
232	removed.
233
234	=back
235
236	=head1 EXAMPLES
237
238	This example extracts all links from a document. It will print one
239	line for each link, containing the URL and the textual description
240	between the <A>...</A> tags:
241
242	use HTML::TokeParser;
243	$p = HTML::TokeParser->new(shift\|\|"index.html");
244
245	while (my $token = $p->get_tag("a")) {
246	my $url = $token->[1]{href} \|\| "-";
247	my $text = $p->get_trimmed_text("/a");
248	print "$url\t$text\n";
249	}
250
251	This example extract the <TITLE> from the document:
252
253	use HTML::TokeParser;
254	$p = HTML::TokeParser->new(shift\|\|"index.html");
255	if ($p->get_tag("title")) {
256	my $title = $p->get_trimmed_text;
257	print "Title: $title\n";
258	}
259
260	=head1 SEE ALSO
261
262	L<HTML::PullParser>, L<HTML::Parser>
263
264	=head1 COPYRIGHT
265
266	Copyright 1998-2001 Gisle Aas.
267
268	This library is free software; you can redistribute it and/or
269	modify it under the same terms as Perl itself.
270
271	=cut