git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental - tools/perl-5.8.0/lib/site

... / ...

Commit	Line	Data
	1	package HTML::Parser;
	2
	3	# Copyright 1996-2003, Gisle Aas.
	4	# Copyright 1999-2000, Michael A. Chase.
	5	#
	6	# This library is free software; you can redistribute it and/or
	7	# modify it under the same terms as Perl itself.
	8
	9	use strict;
	10	use vars qw($VERSION @ISA);
	11
	12	$VERSION = '3.28'; # $Date: 2003/04/17 03:45:34 $
	13
	14	require HTML::Entities;
	15
	16	require DynaLoader;
	17	@ISA=qw(DynaLoader);
	18	HTML::Parser->bootstrap($VERSION);
	19
	20
	# Constructor.  Blesses a fresh empty hash into $class and hands every
	# remaining argument to init(); whatever init() returns is returned to
	# the caller.
	sub new
	{
	    my $class = shift;
	    return bless({}, $class)->init(@_);
	}
	27
	28
	# Set up the parser state of an already blessed object and apply the
	# constructor arguments (same key/value list that new() accepts).
	#
	#   api_version => N   Explicit API level.  When absent (or false), 3 is
	#                      assumed if any argument was given, otherwise 2.
	#                      A level of 4 or more croaks.
	#   handlers => {...}  Hash (or array of pairs) mapping event names to
	#                      array refs that are expanded into handler() calls.
	#   <event>_h => [...] Shortcut for a single handler() call.
	#   anything else      Called as a method of the same name with the
	#                      value as its only argument.
	#
	# Returns $self.
	sub init
	{
	    my $self = shift;
	    $self->_alloc_pstate;

	    my %arg = @_;

	    my $api_version = delete $arg{api_version};
	    $api_version ||= @_ ? 3 : 2;

	    if ($api_version >= 4) {
	        require Carp;
	        Carp::croak("API version $api_version not supported " .
	                    "by HTML::Parser $VERSION");
	    }

	    if ($api_version < 3) {
	        # Set up method callbacks compatible with HTML-Parser-2.xx.
	        # Each entry is [event (also the method name), argspec]; the
	        # registration order is significant only in that it is kept
	        # identical to the historical one.
	        my @v2_methods = (
	            [ text    => "self,text,is_cdata" ],
	            [ end     => "self,tagname,text" ],
	            [ process => "self,token0,text" ],
	            [ start   => "self,tagname,attr,attrseq,text" ],
	        );
	        for my $entry (@v2_methods) {
	            my ($name, $argspec) = @$entry;
	            $self->handler($name, $name, $argspec);
	        }

	        # v2 reported each sub-comment through its own comment() call.
	        $self->handler(comment => sub {
	            my ($parser, $subcomments) = @_;
	            $parser->comment($_) for @$subcomments;
	        }, "self,tokens");

	        # v2 declaration() got the text without the leading "<!" and
	        # the trailing ">".
	        $self->handler(declaration => sub {
	            my $parser = shift;
	            $parser->declaration(substr($_[0], 2, -1));
	        }, "self,text");
	    }

	    my $handlers = delete $arg{handlers};
	    if ($handlers) {
	        $handlers = {@$handlers} if ref($handlers) eq "ARRAY";
	        while (my ($event, $spec) = each %$handlers) {
	            $self->handler($event, @$spec);
	        }
	    }

	    # Whatever is left is either an "<event>_h" handler shortcut or a
	    # plain attribute setter.
	    for my $option (keys %arg) {
	        my $val = $arg{$option};

	        if (my ($event) = $option =~ /^(\w+)_h$/) {
	            $self->handler($event, @$val);
	            next;
	        }

	        # A bare event name is almost certainly a forgotten "_h" suffix;
	        # refuse it instead of calling the callback stub as a setter.
	        if ($option =~ /^(text|start|end|process|declaration|comment)$/) {
	            require Carp;
	            Carp::croak("Bad constructor option '$option'");
	        }

	        $self->$option($val);
	    }

	    return $self;
	}
	88
	89
	# Parse an HTML document straight from a file.
	#
	# $file is either a filename, a file handle (glob) or a reference to an
	# open file handle.  A filename is opened read-only in binary mode and
	# closed again when done; a handle passed in by the caller is read but
	# never closed here.
	#
	# The input is fed to parse() in chunks of up to 512 bytes.  Reading
	# stops early when parse() returns a false value.  eof() is then called
	# and its return value is returned.
	#
	# Returns undef (with $! set by open) if a filename cannot be opened.
	sub parse_file
	{
	    my($self, $file) = @_;
	    my $opened;
	    if (!ref($file) && ref(\$file) ne "GLOB") {
	        # Assume $file is a filename.  The explicit "<" mode makes sure
	        # the name is only ever used as a path to read from: with the
	        # two-argument form a name such as "cmd |" or ">file" would run
	        # a command or clobber a file.
	        # "return undef" (not a bare return) is kept on purpose; the
	        # documented contract is a single undefined value.
	        open(my $fh, "<", $file) || return undef;
	        binmode($fh);  # should we? good for byte counts
	        $opened++;
	        $file = $fh;
	    }
	    my $chunk = '';
	    while (read($file, $chunk, 512)) {
	        $self->parse($chunk) || last;
	    }
	    close($file) if $opened;
	    return $self->eof;
	}
	109
	110
	# Legacy accessor, kept for backwards compatibility.  It is the logical
	# inverse of strict_comment(): it warns about the deprecation, reports
	# the negated current strict_comment value and, when an argument is
	# given, sets strict_comment to the negation of that argument.
	sub netscape_buggy_comment  # legacy
	{
	    my ($self, @new_value) = @_;

	    require Carp;
	    Carp::carp("netscape_buggy_comment() is deprecated. " .
	               "Please use the strict_comment() method instead");

	    my $was_strict = $self->strict_comment;
	    my $old = !$was_strict;

	    if (@new_value) {
	        my $buggy = $new_value[0];
	        $self->strict_comment(!$buggy);
	    }

	    return $old;
	}
	121
	# Method stubs.  text() is an empty callback, and the remaining version 2
	# style callback names are installed as aliases of that very same sub so
	# that a subclass only has to override the events it cares about.
	sub text { }
	{
	    no strict 'refs';
	    *{ __PACKAGE__ . "::$_" } = \&text
	        for qw(start end comment declaration process);
	}
	129
	130	1;
	131
	132	__END__
	133
	134
	135	=head1 NAME
	136
	137	HTML::Parser - HTML parser class
	138
	139	=head1 SYNOPSIS
	140
	141	use HTML::Parser ();
	142
	143	# Create parser object
	144	$p = HTML::Parser->new( api_version => 3,
	145	start_h => [\&start, "tagname, attr"],
	146	end_h => [\&end, "tagname"],
	147	marked_sections => 1,
	148	);
	149
	150	# Parse document text chunk by chunk
	151	$p->parse($chunk1);
	152	$p->parse($chunk2);
	153	#...
	154	$p->eof; # signal end of document
	155
	156	# Parse directly from file
	157	$p->parse_file("foo.html");
	158	# or
	159	open(F, "foo.html") || die;
	160	$p->parse_file(*F);
	161
	162	HTML::Parser version 2 style subclassing and method callbacks:
	163
	164	{
	165	package MyParser;
	166	use base 'HTML::Parser';
	167
	168	sub start {
	169	my($self, $tagname, $attr, $attrseq, $origtext) = @_;
	170	#...
	171	}
	172
	173	sub end {
	174	my($self, $tagname, $origtext) = @_;
	175	#...
	176	}
	177
	178	sub text {
	179	my($self, $origtext, $is_cdata) = @_;
	180	#...
	181	}
	182	}
	183
	184	my $p = MyParser->new;
	185	$p->parse_file("foo.html");
	186
	187	=head1 DESCRIPTION
	188
	189	Objects of the C<HTML::Parser> class will recognize markup and
	190	separate it from plain text (alias data content) in HTML
	191	documents. As different kinds of markup and text are recognized, the
	192	corresponding event handlers are invoked.
	193
	194	C<HTML::Parser> is not a generic SGML parser. We have tried to
	195	make it able to deal with the HTML that is actually "out there", and
	196	it normally parses as closely as possible to the way the popular web
	197	browsers do it instead of strictly following one of the many HTML
	198	specifications from W3C. Where there is disagreement there is often
	199	an option that you can enable to get the official behaviour.
	200
	201	The document to be parsed may be supplied in arbitrary chunks. This
	202	makes on-the-fly parsing as documents are received from the network
	203	possible.
	204
	205	If event driven parsing does not feel right for your application, you
	206	might want to use C<HTML::PullParser>. It is a
	207	C<HTML::Parser> subclass that allows a more conventional program
	208	structure.
	209
	210
	211	=head1 METHODS
	212
	213	The following method is used to construct a new C<HTML::Parser> object:
	214
	215	=over
	216
	217	=item $p = HTML::Parser->new( %options_and_handlers )
	218
	219	This class method creates a new C<HTML::Parser> object and
	220	returns it. Key/value pair arguments may be provided to assign event
	221	handlers or initialize parser options. The handlers and parser
	222	options can also be set or modified later by method calls described below.
	223
	224	If a top level key is in the form "<event>_h" (e.g., "text_h") then it
	225	assigns a handler to that event, otherwise it initializes a parser
	226	option. The event handler specification value must be an array
	227	reference. Multiple handlers may also be assigned with the 'handlers
	228	=> [%handlers]' option. See examples below.
	229
	230	If new() is called without any arguments, it will create a parser that
	231	uses callback methods compatible with version 2 of C<HTML::Parser>.
	232	See the section on "version 2 compatibility" below for details.
	233
	234	Special constructor option 'api_version => 2' can be used to
	235	initialize version 2 callbacks while still setting other options and
	236	handlers. The 'api_version => 3' option can be used if you don't want
	237	to set any options and don't want to fall back to v2 compatible
	238	mode.
	239
	240	Examples:
	241
	242	$p = HTML::Parser->new(api_version => 3,
	243	text_h => [ sub {...}, "dtext" ]);
	244
	245	This creates a new parser object with a text event handler subroutine
	246	that receives the original text with general entities decoded.
	247
	248	$p = HTML::Parser->new(api_version => 3,
	249	start_h => [ 'my_start', "self,tokens" ]);
	250
	251	This creates a new parser object with a start event handler method
	252	that receives the $p and the tokens array.
	253
	254	$p = HTML::Parser->new(api_version => 3,
	255	handlers => { text => [\@array, "event,text"],
	256	comment => [\@array, "event,text"],
	257	});
	258
	259	This creates a new parser object that stores the event type and the
	260	original text in @array for text and comment events.
	261
	262	=back
	263
	264	The following methods feed the HTML document
	265	to the C<HTML::Parser> object:
	266
	267	=over
	268
	269	=item $p->parse( $string )
	270
	271	Parse $string as the next chunk of the HTML document. The return
	272	value is normally a reference to the parser object (i.e. $p).
	273	Handlers invoked should not attempt to modify the $string in-place until
	274	$p->parse returns.
	275
	276	If an invoked event handler aborts parsing by calling $p->eof, then
	277	$p->parse() will return a FALSE value.
	278
	279	=item $p->parse( $code_ref )
	280
	281	If a code reference is passed in as the argument to parse then the
	282	chunks to parse are obtained by invoking this function repeatedly.
	283	Parsing continues until the function returns an empty (or undefined)
	284	result. When this happens $p->eof is automatically signalled.
	285
	286	Parsing will also abort if one of the event handlers calls $p->eof.
	287
	288	The effect of this is the same as:
	289
	290	while (1) {
	291	my $chunk = &$code_ref();
	292	if (!defined($chunk) || !length($chunk)) {
	293	$p->eof;
	294	return $p;
	295	}
	296	$p->parse($chunk) || return undef;
	297	}
	298
	299	But it is more efficient as this loop runs internally in XS code.
	300
	301	=item $p->parse_file( $file )
	302
	303	Parse text directly from a file. The $file argument can be a
	304	filename, an open file handle, or a reference to an open file
	305	handle.
	306
	307	If $file contains a filename and the file can't be opened, then the
	308	method returns an undefined value and $! tells why it failed.
	309	Otherwise the return value is a reference to the parser object.
	310
	311	If a file handle is passed as the $file argument, then the file will
	312	normally be read until EOF, but not closed.
	313
	314	If an invoked event handler aborts parsing by calling $p->eof,
	315	then $p->parse_file() may not have read the entire file.
	316
	317	On systems with multi-byte line terminators, the values passed for the
	318	offset and length argspecs may be too low if parse_file() is called on
	319	a file handle that is not in binary mode.
	320
	321	If a filename is passed in, then parse_file() will open the file in
	322	binary mode.
	323
	324	=item $p->eof
	325
	326	Signals the end of the HTML document. Calling the $p->eof method
	327	outside a handler callback will flush any remaining buffered text
	328	(which triggers the C<text> event if there is any remaining text).
	329
	330	Calling $p->eof inside a handler will terminate parsing at that point
	331	and cause $p->parse to return a FALSE value. This also terminates
	332	parsing by $p->parse_file().
	333
	334	After $p->eof has been called, the parse() and parse_file() methods
	335	can be invoked to feed new documents with the parser object.
	336
	337	The return value from eof() is a reference to the parser object.
	338
	339	=back
	340
	341
	342	Most parser options are controlled by boolean attributes.
	343	Each boolean attribute is enabled by calling the corresponding method
	344	with a TRUE argument and disabled with a FALSE argument. The
	345	attribute value is left unchanged if no argument is given. The return
	346	value from each method is the old attribute value.
	347
	348	Methods that can be used to get and/or set parser options are:
	349
	350	=over
	351
	352	=item $p->strict_comment( [$bool] )
	353
	354	By default, comments are terminated by the first occurrence of "-->".
	355	This is the behaviour of most popular browsers (like Netscape and
	356	MSIE), but it is not correct according to the official HTML
	357	standard. Officially, you need an even number of "--" tokens before
	358	the closing ">" is recognized and there may not be anything but
	359	whitespace between an even and an odd "--".
	360
	361	The official behaviour is enabled by enabling this attribute.
	362
	363	=item $p->strict_names( [$bool] )
	364
	365	By default, almost anything is allowed in tag and attribute names.
	366	This is the behaviour of most popular browsers and allows us to parse
	367	some broken tags with invalid attr values like:
	368
	369	<IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0>
	370
	371	By default, "LIST]" is parsed as a boolean attribute, not as
	372	part of the ALT value as was clearly intended. This is also what
	373	Netscape sees.
	374
	375	The official behaviour is enabled by enabling this attribute. If
	376	enabled, it will cause the tag above to be reported as text
	377	since "LIST]" is not a legal attribute name.
	378
	379	=item $p->boolean_attribute_value( $val )
	380
	381	This method sets the value reported for boolean attributes inside HTML
	382	start tags. By default, the name of the attribute is also used as its
	383	value. This affects the values reported for C<tokens> and C<attr>
	384	argspecs.
	385
	386	=item $p->xml_mode( [$bool] )
	387
	388	Enabling this attribute changes the parser to allow some XML
	389	constructs such as I<empty element tags> and I<XML processing
	390	instructions>. It disables forcing tag and attribute names to lower
	391	case when they are reported by the C<tagname> and C<attr> argspecs,
	392	and suppresses special treatment of elements that are parsed as CDATA
	393	for HTML.
	394
	395	I<Empty element tags> look like start tags, but end with the character
	396	sequence "/>". When recognized by C<HTML::Parser> they cause an
	397	artificial end event in addition to the start event. The C<text> for
	398	the artificial end event will be empty and the C<tokenpos> array will
	399	be undefined even though the only element in the token array will have
	400	the correct tag name.
	401
	402	I<XML processing instructions> are terminated by "?>" instead of a
	403	simple ">" as is the case for HTML.
	404
	405	=item $p->unbroken_text( [$bool] )
	406
	407	By default, blocks of text are given to the text handler as soon as
	408	possible (but the parser makes sure to always break text at the
	409	boundary between whitespace and non-whitespace so single words and
	410	entities always can be decoded safely). This might create breaks that
	411	make it hard to do transformations on the text. When this attribute is
	412	enabled, blocks of text are always reported in one piece. This will
	413	delay the text event until the following (non-text) event has been
	414	recognized by the parser.
	415
	416	Note that the C<offset> argspec will give you the offset of the first
	417	segment of text and C<length> is the combined length of the segments.
	418	Since there might be ignored tags in between, these numbers can't be
	419	used to directly index in the original document file.
	420
	421	=item $p->marked_sections( [$bool] )
	422
	423	By default, section markings like <![CDATA[...]]> are treated like
	424	ordinary text. When this attribute is enabled section markings are
	425	honoured.
	426
	427	There are currently no events associated with the marked section
	428	markup, but the text can be returned as C<skipped_text>.
	429
	430	=item $p->attr_encoded( [$bool] )
	431
	432	By default, the C<attr> and C<@attr> argspecs will have general
	433	entities for attribute values decoded. Enabling this attribute leaves
	434	entities alone.
	435
	436	=item $p->case_sensitive( [$bool] )
	437
	438	By default, tagnames and attribute names are down-cased. Enabling this
	439	attribute leaves them as found in the HTML source document.
	440
	441	=back
	442
	443	As markup and text are recognized, handlers are invoked. The following
	444	method is used to set up handlers for different events:
	445
	446	=over
	447
	448	=item $p->handler( event => \&subroutine, argspec )
	449
	450	=item $p->handler( event => method_name, argspec )
	451
	452	=item $p->handler( event => \@accum, argspec )
	453
	454	=item $p->handler( event => "" );
	455
	456	=item $p->handler( event => undef );
	457
	458	=item $p->handler( event );
	459
	460	This method assigns a subroutine, method, or array to handle an event.
	461
	462	Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>,
	463	C<process>, C<start_document>, C<end_document> or C<default>.
	464
	465	I<Subroutine> is a reference to a subroutine which is called to handle
	466	the event.
	467
	468	I<Method_name> is the name of a method of $p which is called to handle
	469	the event.
	470
	471	I<Accum> is an array that will hold the event information as
	472	sub-arrays.
	473
	474	If the second argument is "", the event is ignored.
	475	If it is undef, the default handler is invoked for the event.
	476
	477	I<Argspec> is a string that describes the information to be reported
	478	for the event. Any requested information that does not apply to a
	479	specific event is passed as C<undef>. If argspec is omitted, then it
	480	is left unchanged since last update.
	481
	482	The return value from $p->handler is the old callback routine or a
	483	reference to the accumulator array.
	484
	485	Any return values from handler callback routines/methods are always
	486	ignored. A handler callback can request parsing to be aborted by
	487	invoking the $p->eof method. A handler callback is not allowed to
	488	invoke the $p->parse() or $p->parse_file() method. An exception will
	489	be raised if it tries.
	490
	491	Examples:
	492
	493	$p->handler(start => "start", 'self, attr, attrseq, text' );
	494
	495	This causes the "start" method of object $p to be called for 'start' events.
	496	The callback signature is $p->start(\%attr, \@attr_seq, $text).
	497
	498	$p->handler(start => \&start, 'attr, attrseq, text' );
	499
	500	This causes subroutine start() to be called for 'start' events.
	501	The callback signature is start(\%attr, \@attr_seq, $text).
	502
	503	$p->handler(start => \@accum, '"S", attr, attrseq, text' );
	504
	505	This causes 'start' event information to be saved in @accum.
	506	The array elements will be ['S', \%attr, \@attr_seq, $text].
	507
	508	$p->handler(start => "");
	509
	510	This causes 'start' events to be ignored. It also suppresses
	511	invocations of any default handler for start events. It is in most
	512	cases equivalent to $p->handler(start => sub {}), but is more
	513	efficient. It is different from the empty-sub-handler in that
	514	C<skipped_text> is not reset by it.
	515
	516	$p->handler(start => undef);
	517
	518	This causes no handler to be associated with start events.
	519	If there is a default handler it will be invoked.
	520
	521	=back
	522
	523	Filters based on tags can be set up to limit the number of events
	524	reported. The main bottleneck during parsing is often the huge number
	525	of callbacks made from the parser. Applying filters can improve
	526	performance significantly.
	527
	528	The following methods control filters:
	529
	530	=over
	531
	532	=item $p->ignore_tags( TAG, ... )
	533
	534	Any C<start> and C<end> events involving any of the tags given are
	535	suppressed.
	536
	537	=item $p->report_tags( TAG, ... )
	538
	539	Any C<start> and C<end> events involving any of the tags I<not> given
	540	are suppressed.
	541
	542	=item $p->ignore_elements( TAG, ... )
	543
	544	Both the C<start> and the C<end> event as well as any events that
	545	would be reported in between are suppressed. The ignored elements can
	546	contain nested occurrences of themselves. Example:
	547
	548	$p->ignore_elements(qw(script style));
	549
	550	The C<script> and C<style> tags will always nest properly since their
	551	content is parsed in CDATA mode. For most other tags
	552	C<ignore_elements> must be used with caution since HTML is often not
	553	I<well formed>.
	554
	555	=back
	556
	557	=head2 Argspec
	558
	559	Argspec is a string containing a comma separated list that describes
	560	the information reported by the event. The following argspec
	561	identifier names can be used:
	562
	563	=over
	564
	565	=item C<self>
	566
	567	Self causes the current object to be passed to the handler. If the
	568	handler is a method, this must be the first element in the argspec.
	569
	570	An alternative to passing self as an argspec is to register closures
	571	that capture $self by themselves as handlers. Unfortunately this
	572	creates a circular reference which prevents the HTML::Parser object
	573	from being garbage collected. Using the C<self> argspec avoids this
	574	problem.
	575
	576	=item C<tokens>
	577
	578	Tokens causes a reference to an array of token strings to be passed.
	579	The strings are exactly as they were found in the original text,
	580	no decoding or case changes are applied.
	581
	582	For C<declaration> events, the array contains each word, comment, and
	583	delimited string starting with the declaration type.
	584
	585	For C<comment> events, this contains each sub-comment. If
	586	$p->strict_comment is disabled, there will be only one sub-comment.
	587
	588	For C<start> events, this contains the original tag name followed by
	589	the attribute name/value pairs. The value of boolean attributes will
	590	be either the value set by $p->boolean_attribute_value or the
	591	attribute name if no value has been set by
	592	$p->boolean_attribute_value.
	593
	594	For C<end> events, this contains the original tag name (always one token).
	595
	596	For C<process> events, this contains the process instructions (always one
	597	token).
	598
	599	This passes C<undef> for C<text> events.
	600
	601	=item C<tokenpos>
	602
	603	Tokenpos causes a reference to an array of token positions to be
	604	passed. For each string that appears in C<tokens>, this array
	605	contains two numbers. The first number is the offset of the start of
	606	the token in the original C<text> and the second number is the length
	607	of the token.
	608
	609	Boolean attributes in a C<start> event will have (0,0) for the
	610	attribute value offset and length.
	611
	612	This passes undef if there are no tokens in the event (e.g., C<text>)
	613	and for artificial C<end> events triggered by empty element tags.
	614
	615	If you are using these offsets and lengths to modify C<text>, you
	616	should either work from right to left, or be very careful to calculate
	617	the changes to the offsets.
	618
	619	=item C<token0>
	620
	621	Token0 causes the original text of the first token string to be
	622	passed. This should always be the same as $tokens->[0].
	623
	624	For C<declaration> events, this is the declaration type.
	625
	626	For C<start> and C<end> events, this is the tag name.
	627
	628	For C<process> and non-strict C<comment> events, this is everything
	629	inside the tag.
	630
	631	This passes undef if there are no tokens in the event.
	632
	633	=item C<tagname>
	634
	635	This is the element name (or I<generic identifier> in SGML jargon) for
	636	start and end tags. Since HTML is case insensitive this name is
	637	forced to lower case to ease string matching.
	638
	639	Since XML is case sensitive, the tagname case is not changed when
	640	C<xml_mode> is enabled. Same happens if the C<case_sensitive> attribute
	641	is set.
	642
	643	The declaration type of declaration elements is also passed as a tagname,
	644	even if that is a bit strange.
	645	In fact, in the current implementation tagname is
	646	identical to C<token0> except that the name may be forced to lower case.
	647
	648	=item C<tag>
	649
	650	Same as C<tagname>, but prefixed with "/" if it belongs to an C<end>
	651	event and "!" for a declaration. The C<tag> does not have any prefix
	652	for C<start> events, and is in this case identical to C<tagname>.
	653
	654	=item C<attr>
	655
	656	Attr causes a reference to a hash of attribute name/value pairs to be
	657	passed.
	658
	659	Boolean attributes' values are either the value set by
	660	$p->boolean_attribute_value or the attribute name if no value has been
	661	set by $p->boolean_attribute_value.
	662
	663	This passes undef except for C<start> events.
	664
	665	Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
	666	names are forced to lower case.
	667
	668	General entities are decoded in the attribute values and
	669	one layer of matching quotes enclosing the attribute values are removed.
	670
	671	=item C<attrseq>
	672
	673	Attrseq causes a reference to an array of attribute names to be
	674	passed. This can be useful if you want to walk the C<attr> hash in
	675	the original sequence.
	676
	677	This passes undef except for C<start> events.
	678
	679	Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
	680	names are forced to lower case.
	681
	682	=item C<@attr>
	683
	684	Basically same as C<attr>, but keys and values are passed as
	685	individual arguments and the original sequence of the attributes is
	686	kept. The parameters passed will be the same as the @attr calculated
	687	here:
	688
	689	@attr = map { $_ => $attr->{$_} } @$attrseq;
	690
	691	assuming $attr and $attrseq here are the hash and array passed as the
	692	result of C<attr> and C<attrseq> argspecs.
	693
	694	This passes no values for events besides C<start>.
	695
	696	=item C<text>
	697
	698	Text causes the source text (including markup element delimiters) to be
	699	passed.
	700
	701	=item C<dtext>
	702
	703	Dtext causes the decoded text to be passed. General entities are
	704	automatically decoded unless the event was inside a CDATA section or
	705	was between literal start and end tags (C<script>, C<style>, C<xmp>,
	706	and C<plaintext>).
	707
	708	The Unicode character set is assumed for entity decoding. With perl
	709	version < 5.7.1 only the Latin1 range is supported, and entities for
	710	characters outside the 0..255 range are left unchanged.
	711
	712	This passes undef except for C<text> events.
	713
	714	=item C<is_cdata>
	715
	716	Is_cdata causes a TRUE value to be passed if the event is inside a CDATA
	717	section or is between literal start and end tags (C<script>,
	718	C<style>, C<xmp>, and C<plaintext>).
	719
	720	When the flag is FALSE for a text event, then you should normally
	721	either use C<dtext> or decode the entities yourself before the text is
	722	processed further.
	723
	724	=item C<skipped_text>
	725
	726	Skipped_text returns the concatenated text of all the events that have
	727	been skipped since the last time an event was reported. Events might
	728	be skipped because no handler is registered for them or because some
	729	filter applies. Skipped text also includes marked section markup,
	730	since there are no events that can catch them.
	731
	732	If an C<"">-handler is registered for an event, then the text for this
	733	event is not included in C<skipped_text>. Skipped text both before
	734	and after the C<"">-event is included in the next reported
	735	C<skipped_text>.
	736
	737	=item C<offset>
	738
	739	Offset causes the byte position in the HTML document of the start of
	740	the event to be passed. The first byte in the document is 0.
	741
	742	=item C<length>
	743
	744	Length causes the number of bytes of the source text of the event to
	745	be passed.
	746
	747	=item C<offset_end>
	748
	749	Offset_end causes the byte position in the HTML document of the end of
	750	the event to be passed. This is the same as C<offset> + C<length>.
	751
	752	=item C<event>
	753
	754	Event causes the event name to be passed.
	755
	756	The event name is one of C<text>, C<start>, C<end>, C<declaration>,
	757	C<comment>, C<process>, C<start_document>, C<end_document> or C<default>.
	758
	759	=item C<line>
	760
	761	Line causes the line number of the start of the event to be passed.
	762	The first line in the document is 1. Line counting doesn't start
	763	until at least one handler requests this value to be reported.
	764
	765	=item C<column>
	766
	767	Column causes the column number of the start of the event to be passed.
	768	The first column on a line is 0.
	769
	770	=item C<'...'>
	771
	772	A literal string of 0 to 255 characters enclosed
	773	in single (') or double (") quotes is passed as entered.
	774
	775	=item C<undef>
	776
	777	Pass an undefined value. Useful as padding where the same handler
	778	routine is registered for multiple events.
	779
	780	=back
	781
	782	The whole argspec string can be wrapped up in C<'@{...}'> to signal
	783	that the resulting event array should be flattened. This only makes a
	784	difference if an array reference is used as the handler target.
	785	Consider this example:
	786
	787	$p->handler(text => [], 'text');
	788	$p->handler(text => [], '@{text}');
	789
	790	With two text events; C<"foo">, C<"bar">; then the first one will end
	791	up with [["foo"], ["bar"]] and the second one with ["foo", "bar"] in
	792	the handler target array.
	793
	794
	795	=head2 Events
	796
	797	Handlers for the following events can be registered:
	798
	799	=over
	800
	801	=item C<text>
	802
	803	This event is triggered when plain text (characters) is recognized.
	804	The text may contain multiple lines. A sequence of text may be broken
	805	between several text events unless $p->unbroken_text is enabled.
	806
	807	The parser will make sure that it does not break a word or a sequence
	808	of whitespace between two text events.
	809
	810	=item C<start>
	811
	812	This event is triggered when a start tag is recognized.
	813
	814	Example:
	815
	816	<A HREF="http://www.perl.com/">
	817
	818	=item C<end>
	819
	820	This event is triggered when an end tag is recognized.
	821
	822	Example:
	823
	824	</A>
	825
	826	=item C<declaration>
	827
	828	This event is triggered when a I<markup declaration> is recognized.
	829
	830	For typical HTML documents, the only declaration you are
	831	likely to find is <!DOCTYPE ...>.
	832
	833	Example:
	834
	835	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
	836	"http://www.w3.org/TR/html40/strict.dtd">
	837
	838	DTDs inside <!DOCTYPE ...> will confuse HTML::Parser.
	839
	840	=item C<comment>
	841
	842	This event is triggered when a markup comment is recognized.
	843
	844	Example:
	845
	846	<!-- This is a comment -- -- So is this -->
	847
	848	=item C<process>
	849
	850	This event is triggered when processing instruction markup is
	851	recognized.
	852
	853	The format and content of processing instructions is system and
	854	application dependent.
	855
	856	Examples:
	857
	858	<? HTML processing instructions >
	859	<? XML processing instructions ?>
	860
	861	=item C<start_document>
	862
	863	This event is triggered before any other events for a new document. A
	864	handler for it can be used to initialize stuff. There is no document
	865	text associated with this event.
	866
	867	=item C<end_document>
	868
	869	This event is triggered when $p->eof is called and after any remaining
	870	text is flushed. There is no document text associated with this event.
	871
	872	=item C<default>
	873
	874	This event is triggered for events that do not have a specific
	875	handler. You can set up a handler for this event to catch stuff you
	876	did not want to catch explicitly.
	877
	878	=back
	879
	880	=head1 VERSION 2 COMPATIBILITY
	881
	882	When an C<HTML::Parser> object is constructed with no arguments, a set
	883	of handlers is automatically provided that is compatible with the old
	884	HTML::Parser version 2 callback methods.
	885
	886	This is equivalent to the following method calls:
	887
	888	$p->handler(start => "start", "self, tagname, attr, attrseq, text");
	889	$p->handler(end => "end", "self, tagname, text");
	890	$p->handler(text => "text", "self, text, is_cdata");
	891	$p->handler(process => "process", "self, token0, text");
	892	$p->handler(comment =>
	893	sub {
	894	my($self, $tokens) = @_;
	895	for (@$tokens) {$self->comment($_);}},
	896	"self, tokens");
	897	$p->handler(declaration =>
	898	sub {
	899	my $self = shift;
	900	$self->declaration(substr($_[0], 2, -1));},
	901	"self, text");
	902
	903	Setup of these handlers can also be requested with the "api_version =>
	904	2" constructor option.
	905
	906	=head1 SUBCLASSING
	907
	908	The C<HTML::Parser> class is subclassable. Parser objects are plain
	909	hashes and C<HTML::Parser> reserves only hash keys that start with
	910	"_hparser". The parser state can be set up by invoking the init()
	911	method which takes the same arguments as new().
	912
	913	=head1 EXAMPLES
	914
	915	The first simple example shows how you might strip out comments from
	916	an HTML document. We achieve this by setting up a comment handler that
	917	does nothing and a default handler that will print out anything else:
	918
	919	use HTML::Parser;
	920	HTML::Parser->new(default_h => [sub { print shift }, 'text'],
	921	comment_h => [""],
	922	)->parse_file(shift || die) || die $!;
	923
	924	An alternative implementation is:
	925
	926	use HTML::Parser;
	927	HTML::Parser->new(end_document_h => [sub { print shift },
	928	'skipped_text'],
	929	comment_h => [""],
	930	)->parse_file(shift || die) || die $!;
	931
	932	This will in most cases be much more efficient since only a single
	933	callback will be made.
	934
	935	The next example prints out the text that is inside the <title>
	936	element of an HTML document. Here we start by setting up a start
	937	handler. When it sees the title start tag it enables a text handler
	938	that prints any text found and an end handler that will terminate
	939	parsing as soon as the title end tag is seen:
	940
	941	use HTML::Parser ();
	942
	943	sub start_handler
	944	{
	945	return if shift ne "title";
	946	my $self = shift;
	947	$self->handler(text => sub { print shift }, "dtext");
	948	$self->handler(end => sub { shift->eof if shift eq "title"; },
	949	"tagname,self");
	950	}
	951
	952	my $p = HTML::Parser->new(api_version => 3);
	953	$p->handler( start => \&start_handler, "tagname,self");
	954	$p->parse_file(shift || die) || die $!;
	955	print "\n";
	956
	957	More examples are found in the "eg/" directory of the C<HTML-Parser>
	958	distribution; the program C<hrefsub> shows how you can edit all links
	959	found in a document and C<htextsub> how to edit the text only; the
	960	program C<hstrip> shows how you can strip out certain tags/elements
	961	and/or attributes; and the program C<htext> shows how to obtain the
	962	plain text, but not any script/style content.
	963
	964	=head1 BUGS
	965
	966	The <style> and <script> sections do not end with the first "</", but
	967	need the complete corresponding end tag.
	968
	969	When the I<strict_comment> option is enabled, we still recognize
	970	comments where there is something other than whitespace between even
	971	and odd "--" markers.
	972
	973	Once $p->boolean_attribute_value has been set, there is no way to
	974	restore the default behaviour.
	975
	976	There is currently no way to get both quote characters
	977	into the same literal argspec.
	978
	979	Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them
	980	to repeat the previous start tag or close the previous start tag
	981	respectively.
	982
	983	NET tags, e.g. "<code/.../" are not recognized. This is an SGML
	984	shorthand for "<code>...</code>".
	985
	986	Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not
	987	recognized.
	988
	989	=head1 DIAGNOSTICS
	990
	991	The following messages may be produced by HTML::Parser. The notation
	992	in this listing is the same as used in L<perldiag>:
	993
	994	=over
	995
	996	=item Not a reference to a hash
	997
	998	(F) The object blessed into or subclassed from HTML::Parser is not a
	999	hash as required by the HTML::Parser methods.
	1000
	1001	=item Bad signature in parser state object at %p
	1002
	1003	(F) The _hparser_xs_state element does not refer to a valid state structure.
	1004	Something must have changed the internal value
	1005	stored in this hash element, or the memory has been overwritten.
	1006
	1007	=item _hparser_xs_state element is not a reference
	1008
	1009	(F) The _hparser_xs_state element has been destroyed.
	1010
	1011	=item Can't find '_hparser_xs_state' element in HTML::Parser hash
	1012
	1013	(F) The _hparser_xs_state element is missing from the parser hash.
	1014	It was either deleted, or not created when the object was created.
	1015
	1016	=item API version %s not supported by HTML::Parser %s
	1017
	1018	(F) The constructor option 'api_version' with an argument greater than
	1019	or equal to 4 is reserved for future extensions.
	1020
	1021	=item Bad constructor option '%s'
	1022
	1023	(F) An unknown constructor option key was passed to the new() or
	1024	init() methods.
	1025
	1026	=item Parse loop not allowed
	1027
	1028	(F) A handler invoked the parse() or parse_file() method.
	1029	This is not permitted.
	1030
	1031	=item marked sections not supported
	1032
	1033	(F) The $p->marked_sections() method was invoked in an HTML::Parser
	1034	module that was compiled without support for marked sections.
	1035
	1036	=item Unknown boolean attribute (%d)
	1037
	1038	(F) Something is wrong with the internal logic that set up aliases for
	1039	boolean attributes.
	1040
	1041	=item Only code or array references allowed as handler
	1042
	1043	(F) The second argument for $p->handler must be either a subroutine
	1044	reference, the name of a subroutine or method, or a reference to an
	1045	array.
	1046
	1047	=item No handler for %s events
	1048
	1049	(F) The first argument to $p->handler must be a valid event name; i.e. one
	1050	of "start", "end", "text", "process", "declaration" or "comment".
	1051
	1052	=item Unrecognized identifier %s in argspec
	1053
	1054	(F) The identifier is not a known argspec name.
	1055	Use one of the names mentioned in the argspec section above.
	1056
	1057	=item Literal string is longer than 255 chars in argspec
	1058
	1059	(F) The current implementation limits the length of literals in
	1060	an argspec to 255 characters. Make the literal shorter.
	1061
	1062	=item Backslash reserved for literal string in argspec
	1063
	1064	(F) The backslash character "\" is not allowed in argspec literals.
	1065	It is reserved to permit quoting inside a literal in a later version.
	1066
	1067	=item Unterminated literal string in argspec
	1068
	1069	(F) The terminating quote character for a literal was not found.
	1070
	1071	=item Bad argspec (%s)
	1072
	1073	(F) Only identifier names, literals, spaces and commas
	1074	are allowed in argspecs.
	1075
	1076	=item Missing comma separator in argspec
	1077
	1078	(F) Identifiers in an argspec must be separated with ",".
	1079
	1080	=back
	1081
	1082	=head1 SEE ALSO
	1083
	1084	L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>,
	1085	L<HTML::LinkExtor>, L<HTML::Form>
	1086
	1087	L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution)
	1088
	1089	http://www.w3.org/TR/REC-html40
	1090
	1091	More information about marked sections and processing instructions may
	1092	be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>.
	1093
	1094	=head1 COPYRIGHT
	1095
	1096	Copyright 1996-2003 Gisle Aas. All rights reserved.
	1097	Copyright 1999-2000 Michael A. Chase. All rights reserved.
	1098
	1099	This library is free software; you can redistribute it and/or
	1100	modify it under the same terms as Perl itself.
	1101
	1102	=cut