git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental - tools/perl-5.8.0/lib/site_perl/5.8.0/sun4-solaris/HTML/TokeParser.pm

... / ...

Commit	Line	Data
	1	package HTML::TokeParser;
	2
	3	# $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $
	4
	5	require HTML::PullParser;
	6	@ISA=qw(HTML::PullParser);
	7	$VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/);
	8
	9	use strict;
	10	use Carp ();
	11	use HTML::Entities qw(decode_entities);
	12
	13	my %ARGS =
	14	(
	15	start => "'S',tagname,attr,attrseq,text",
	16	end => "'E',tagname,text",
	17	text => "'T',text,is_cdata",
	18	process => "'PI',token0,text",
	19	comment => "'C',text",
	20	declaration => "'D',text",
	21	);
	22
	23
	24	sub new
	25	{
	26	my $class = shift;
	27	my %cnf;
	28	if (@_ == 1) {
	29	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
	30	%cnf = ($type => $_[0]);
	31	}
	32	else {
	33	%cnf = @_;
	34	}
	35
	36	my $textify = delete $cnf{textify} \|\| {img => "alt", applet => "alt"};
	37
	38	my $self = $class->SUPER::new(%cnf, %ARGS) \|\| return undef;
	39
	40	$self->{textify} = $textify;
	41	$self;
	42	}
	43
	44
	45	sub get_tag
	46	{
	47	my $self = shift;
	48	my $token;
	49	while (1) {
	50	$token = $self->get_token \|\| return undef;
	51	my $type = shift @$token;
	52	next unless $type eq "S" \|\| $type eq "E";
	53	substr($token->[0], 0, 0) = "/" if $type eq "E";
	54	return $token unless @_;
	55	for (@_) {
	56	return $token if $token->[0] eq $_;
	57	}
	58	}
	59	}
	60
	61
	62	sub get_text
	63	{
	64	my $self = shift;
	65	my $endat = shift;
	66	my @text;
	67	while (my $token = $self->get_token) {
	68	my $type = $token->[0];
	69	if ($type eq "T") {
	70	my $text = $token->[1];
	71	decode_entities($text) unless $token->[2];
	72	push(@text, $text);
	73	} elsif ($type =~ /^[SE]$/) {
	74	my $tag = $token->[1];
	75	if ($type eq "S") {
	76	if (exists $self->{textify}{$tag}) {
	77	my $alt = $self->{textify}{$tag};
	78	my $text;
	79	if (ref($alt)) {
	80	$text = &$alt(@$token);
	81	} else {
	82	$text = $token->[2]{$alt \|\| "alt"};
	83	$text = "[\U$tag]" unless defined $text;
	84	}
	85	push(@text, $text);
	86	next;
	87	}
	88	} else {
	89	$tag = "/$tag";
	90	}
	91	if (!defined($endat) \|\| $endat eq $tag) {
	92	$self->unget_token($token);
	93	last;
	94	}
	95	}
	96	}
	97	join("", @text);
	98	}
	99
	100
	101	sub get_trimmed_text
	102	{
	103	my $self = shift;
	104	my $text = $self->get_text(@_);
	105	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
	106	$text;
	107	}
	108
	109	1;
	110
	111
	112	__END__
	113
	114	=head1 NAME
	115
	116	HTML::TokeParser - Alternative HTML::Parser interface
	117
	118	=head1 SYNOPSIS
	119
	120	require HTML::TokeParser;
	121	$p = HTML::TokeParser->new("index.html") \|\| die "Can't open: $!";
	122	while (my $token = $p->get_token) {
	123	#...
	124	}
	125
	126	=head1 DESCRIPTION
	127
	128	The C<HTML::TokeParser> is an alternative interface to the
	129	C<HTML::Parser> class. It is an C<HTML::PullParser> subclass.
	130
	131	The following methods are available:
	132
	133	=over 4
	134
	135	=item $p = HTML::TokeParser->new( $file_or_doc );
	136
	137	The object constructor argument is either a file name, a file handle
	138	object, or the complete document to be parsed.
	139
	140	If the argument is a plain scalar, then it is taken as the name of a
	141	file to be opened and parsed. If the file can't be opened for
	142	reading, then the constructor will return an undefined value and $!
	143	will tell you why it failed.
	144
	145	If the argument is a reference to a plain scalar, then this scalar is
	146	taken to be the literal document to parse. The value of this
	147	scalar should not be changed before all tokens have been extracted.
	148
	149	Otherwise the argument is taken to be some object that the
	150	C<HTML::TokeParser> can read() from when it needs more data. Typically
	151	it will be a filehandle of some kind. The stream will be read() until
	152	EOF, but not closed.
	153
	154	=item $p->get_token
	155
	156	This method will return the next I<token> found in the HTML document,
	157	or C<undef> at the end of the document. The token is returned as an
	158	array reference. The first element of the array will be a (mostly)
	159	single character string denoting the type of this token: "S" for start
	160	tag, "E" for end tag, "T" for text, "C" for comment, "D" for
	161	declaration, and "PI" for process instructions. The rest of the array
	162	is the same as the arguments passed to the corresponding HTML::Parser
	163	v2 compatible callbacks (see L<HTML::Parser>). In summary, returned
	164	tokens look like this:
	165
	166	["S", $tag, $attr, $attrseq, $text]
	167	["E", $tag, $text]
	168	["T", $text, $is_data]
	169	["C", $text]
	170	["D", $text]
	171	["PI", $token0, $text]
	172
	173	where $attr is a hash reference, $attrseq is an array reference and
	174	the rest is plain scalars.
	175
	176	=item $p->unget_token($token,...)
	177
	178	If you find out you have read too many tokens you can push them back,
	179	so that they are returned the next time $p->get_token is called.
	180
	181	=item $p->get_tag( [$tag, ...] )
	182
	183	This method returns the next start or end tag (skipping any other
	184	tokens), or C<undef> if there are no more tags in the document. If
	185	one or more arguments are given, then we skip tokens until one of the
	186	specified tag types is found. For example:
	187
	188	$p->get_tag("font", "/font");
	189
	190	will find the next start or end tag for a font-element.
	191
	192	The tag information is returned as an array reference in the same form
	193	as for $p->get_token above, but the type code (first element) is
	194	missing. A start tag will be returned like this:
	195
	196	[$tag, $attr, $attrseq, $text]
	197
	198	The tagname of end tags are prefixed with "/", i.e. end tag is
	199	returned like this:
	200
	201	["/$tag", $text]
	202
	203	=item $p->get_text( [$endtag] )
	204
	205	This method returns all text found at the current position. It will
	206	return a zero length string if the next token is not text. The
	207	optional $endtag argument specifies that any text occurring before the
	208	given tag is to be returned. Any entities will be converted to their
	209	corresponding character.
	210
	211	The $p->{textify} attribute is a hash that defines how certain tags can
	212	be treated as text. If the name of a start tag matches a key in this
	213	hash then this tag is converted to text. The hash value is used to
	214	specify which tag attribute to obtain the text from. If this tag
	215	attribute is missing, then the upper case name of the tag enclosed in
	216	brackets is returned, e.g. "[IMG]". The hash value can also be a
	217	subroutine reference. In this case the routine is called with the
	218	start tag token content as its argument and the return value is treated
	219	as the text.
	220
	221	The default $p->{textify} value is:
	222
	223	{img => "alt", applet => "alt"}
	224
	225	This means that <IMG> and <APPLET> tags are treated as text, and that
	226	the text to substitute can be found in the ALT attribute.
	227
	228	=item $p->get_trimmed_text( [$endtag] )
	229
	230	Same as $p->get_text above, but will collapse any sequences of white
	231	space to a single space character. Leading and trailing white space is
	232	removed.
	233
	234	=back
	235
	236	=head1 EXAMPLES
	237
	238	This example extracts all links from a document. It will print one
	239	line for each link, containing the URL and the textual description
	240	between the <A>...</A> tags:
	241
	242	use HTML::TokeParser;
	243	$p = HTML::TokeParser->new(shift\|\|"index.html");
	244
	245	while (my $token = $p->get_tag("a")) {
	246	my $url = $token->[1]{href} \|\| "-";
	247	my $text = $p->get_trimmed_text("/a");
	248	print "$url\t$text\n";
	249	}
	250
	251	This example extract the <TITLE> from the document:
	252
	253	use HTML::TokeParser;
	254	$p = HTML::TokeParser->new(shift\|\|"index.html");
	255	if ($p->get_tag("title")) {
	256	my $title = $p->get_trimmed_text;
	257	print "Title: $title\n";
	258	}
	259
	260	=head1 SEE ALSO
	261
	262	L<HTML::PullParser>, L<HTML::Parser>
	263
	264	=head1 COPYRIGHT
	265
	266	Copyright 1998-2001 Gisle Aas.
	267
	268	This library is free software; you can redistribute it and/or
	269	modify it under the same terms as Perl itself.
	270
	271	=cut