git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental - tools/perl-5.8.0/man/man3/HTML::TokeParser.3

... / ...

Commit	Line	Data
	1	.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \|\(em\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
	65	.hy 0
	66	.if n .na
	67	.\"
	68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	69	.\" Fear. Run. Save yourself. No user-serviceable parts.
	70	. \" fudge factors for nroff and troff
	71	.if n \{\
	72	. ds #H 0
	73	. ds #V .8m
	74	. ds #F .3m
	75	. ds #[ \f1
	76	. ds #] \fP
	77	.\}
	78	.if t \{\
	79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
	80	. ds #V .6m
	81	. ds #F 0
	82	. ds #[ \&
	83	. ds #] \&
	84	.\}
	85	. \" simple accents for nroff and troff
	86	.if n \{\
	87	. ds ' \&
	88	. ds ` \&
	89	. ds ^ \&
	90	. ds , \&
	91	. ds ~ ~
	92	. ds /
	93	.\}
	94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
	101	.\}
	102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
	104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
	106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
	108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
	109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
	110	.ds ae a\h'-(\w'a'u*4/10)'e
	111	.ds Ae A\h'-(\w'A'u*4/10)'E
	112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
	115	. \" for low resolution devices (crt and lpr)
	116	.if \n(.H>23 .if \n(.V>19 \
	117	\{\
	118	. ds : e
	119	. ds 8 ss
	120	. ds o a
	121	. ds d- d\h'-1'\(ga
	122	. ds D- D\h'-1'\(hy
	123	. ds th \o'bp'
	124	. ds Th \o'LP'
	125	. ds ae ae
	126	. ds Ae AE
	127	.\}
	128	.rm #[ #] #H #V #F C
	129	.\" ========================================================================
	130	.\"
	131	.IX Title "HTML::TokeParser 3"
	132	.TH HTML::TokeParser 3 "2001-04-10" "perl v5.8.0" "User Contributed Perl Documentation"
	133	.SH "NAME"
	134	HTML::TokeParser \- Alternative HTML::Parser interface
	135	.SH "SYNOPSIS"
	136	.IX Header "SYNOPSIS"
	137	.Vb 5
	138	\& require HTML::TokeParser;
	139	\& $p = HTML::TokeParser->new("index.html") || die "Can't open: $!";
	140	\& while (my $token = $p->get_token) {
	141	\& #...
	142	\& }
	143	.Ve
	144	.SH "DESCRIPTION"
	145	.IX Header "DESCRIPTION"
	146	The \f(CW\*(C`HTML::TokeParser\*(C'\fR is an alternative interface to the
	147	\&\f(CW\*(C`HTML::Parser\*(C'\fR class. It is an \f(CW\*(C`HTML::PullParser\*(C'\fR subclass.
	148	.PP
	149	The following methods are available:
	150	.ie n .IP "$p = HTML::TokeParser\->new( $file_or_doc );" 4
	151	.el .IP "$p = HTML::TokeParser\->new( \f(CW$file_or_doc\fR );" 4
	152	.IX Item "$p = HTML::TokeParser->new( $file_or_doc );"
	153	The object constructor argument is either a file name, a file handle
	154	object, or the complete document to be parsed.
	155	.Sp
	156	If the argument is a plain scalar, then it is taken as the name of a
	157	file to be opened and parsed. If the file can't be opened for
	158	reading, then the constructor will return an undefined value and $!
	159	will tell you why it failed.
	160	.Sp
	161	If the argument is a reference to a plain scalar, then this scalar is
	162	taken to be the literal document to parse. The value of this
	163	scalar should not be changed before all tokens have been extracted.
	164	.Sp
	165	Otherwise the argument is taken to be some object that the
	166	\&\f(CW\*(C`HTML::TokeParser\*(C'\fR can \fIread()\fR from when it needs more data. Typically
	167	it will be a filehandle of some kind. The stream will be \fIread()\fR until
	168	\&\s-1EOF\s0, but not closed.
	169	.IP "$p\->get_token" 4
	170	.IX Item "$p->get_token"
	171	This method will return the next \fItoken\fR found in the \s-1HTML\s0 document,
	172	or \f(CW\*(C`undef\*(C'\fR at the end of the document. The token is returned as an
	173	array reference. The first element of the array will be a (mostly)
	174	single character string denoting the type of this token: \*(L"S\*(R" for start
	175	tag, \*(L"E\*(R" for end tag, \*(L"T\*(R" for text, \*(L"C\*(R" for comment, \*(L"D\*(R" for
	176	declaration, and \*(L"\s-1PI\s0\*(R" for processing instructions. The rest of the array
	177	is the same as the arguments passed to the corresponding HTML::Parser
	178	v2 compatible callbacks (see HTML::Parser). In summary, returned
	179	tokens look like this:
	180	.Sp
	181	.Vb 6
	182	\& ["S", $tag, $attr, $attrseq, $text]
	183	\& ["E", $tag, $text]
	184	\& ["T", $text, $is_data]
	185	\& ["C", $text]
	186	\& ["D", $text]
	187	\& ["PI", $token0, $text]
	188	.Ve
	189	.Sp
	190	where \f(CW$attr\fR is a hash reference, \f(CW$attrseq\fR is an array reference and
	191	the rest are plain scalars.
	192	.IP "$p\->unget_token($token,...)" 4
	193	.IX Item "$p->unget_token($token,...)"
	194	If you find out you have read too many tokens you can push them back,
	195	so that they are returned the next time \f(CW$p\fR\->get_token is called.
	196	.IP "$p\->get_tag( [$tag, ...] )" 4
	197	.IX Item "$p->get_tag( [$tag, ...] )"
	198	This method returns the next start or end tag (skipping any other
	199	tokens), or \f(CW\*(C`undef\*(C'\fR if there are no more tags in the document. If
	200	one or more arguments are given, then we skip tokens until one of the
	201	specified tag types is found. For example:
	202	.Sp
	203	.Vb 1
	204	\& $p->get_tag("font", "/font");
	205	.Ve
	206	.Sp
	207	will find the next start or end tag for a font\-element.
	208	.Sp
	209	The tag information is returned as an array reference in the same form
	210	as for \f(CW$p\fR\->get_token above, but the type code (first element) is
	211	missing. A start tag will be returned like this:
	212	.Sp
	213	.Vb 1
	214	\& [$tag, $attr, $attrseq, $text]
	215	.Ve
	216	.Sp
	217	The tag name of an end tag is prefixed with \*(L"/\*(R", i.e. an end tag is
	218	returned like this:
	219	.Sp
	220	.Vb 1
	221	\& ["/$tag", $text]
	222	.Ve
	223	.IP "$p\->get_text( [$endtag] )" 4
	224	.IX Item "$p->get_text( [$endtag] )"
	225	This method returns all text found at the current position. It will
	226	return a zero length string if the next token is not text. The
	227	optional \f(CW$endtag\fR argument specifies that any text occurring before the
	228	given tag is to be returned. Any entities will be converted to their
	229	corresponding characters.
	230	.Sp
	231	The \f(CW$p\fR\->{textify} attribute is a hash that defines how certain tags can
	232	be treated as text. If the name of a start tag matches a key in this
	233	hash then this tag is converted to text. The hash value is used to
	234	specify which tag attribute to obtain the text from. If this tag
	235	attribute is missing, then the upper case name of the tag enclosed in
	236	brackets is returned, e.g. \*(L"[\s-1IMG\s0]\*(R". The hash value can also be a
	237	subroutine reference. In this case the routine is called with the
	238	start tag token content as its argument and the return value is treated
	239	as the text.
	240	.Sp
	241	The default \f(CW$p\fR\->{textify} value is:
	242	.Sp
	243	.Vb 1
	244	\& {img => "alt", applet => "alt"}
	245	.Ve
	246	.Sp
	247	This means that <\s-1IMG\s0> and <\s-1APPLET\s0> tags are treated as text, and that
	248	the text to substitute can be found in the \s-1ALT\s0 attribute.
	249	.IP "$p\->get_trimmed_text( [$endtag] )" 4
	250	.IX Item "$p->get_trimmed_text( [$endtag] )"
	251	Same as \f(CW$p\fR\->get_text above, but will collapse any sequences of white
	252	space to a single space character. Leading and trailing white space is
	253	removed.
	254	.SH "EXAMPLES"
	255	.IX Header "EXAMPLES"
	256	This example extracts all links from a document. It will print one
	257	line for each link, containing the \s-1URL\s0 and the textual description
	258	between the <A>...</A> tags:
	259	.PP
	260	.Vb 2
	261	\& use HTML::TokeParser;
	262	\& $p = HTML::TokeParser->new(shift||"index.html");
	263	.Ve
	264	.PP
	265	.Vb 5
	266	\& while (my $token = $p->get_tag("a")) {
	267	\& my $url = $token->[1]{href} || "-";
	268	\& my $text = $p->get_trimmed_text("/a");
	269	\& print "$url\et$text\en";
	270	\& }
	271	.Ve
	272	.PP
	273	This example extracts the <\s-1TITLE\s0> from the document:
	274	.PP
	275	.Vb 6
	276	\& use HTML::TokeParser;
	277	\& $p = HTML::TokeParser->new(shift||"index.html");
	278	\& if ($p->get_tag("title")) {
	279	\& my $title = $p->get_trimmed_text;
	280	\& print "Title: $title\en";
	281	\& }
	282	.Ve
	283	.SH "SEE ALSO"
	284	.IX Header "SEE ALSO"
	285	HTML::PullParser, HTML::Parser
	286	.SH "COPYRIGHT"
	287	.IX Header "COPYRIGHT"
	288	Copyright 1998\-2001 Gisle Aas.
	289	.PP
	290	This library is free software; you can redistribute it and/or
	291	modify it under the same terms as Perl itself.