Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / sun4-solaris / HTML / Parser.pm
CommitLineData
86530b38
AT
1package HTML::Parser;
2
3# Copyright 1996-2003, Gisle Aas.
4# Copyright 1999-2000, Michael A. Chase.
5#
6# This library is free software; you can redistribute it and/or
7# modify it under the same terms as Perl itself.
8
9use strict;
10use vars qw($VERSION @ISA);
11
12$VERSION = '3.28'; # $Date: 2003/04/17 03:45:34 $
13
14require HTML::Entities;
15
16require DynaLoader;
17@ISA=qw(DynaLoader);
18HTML::Parser->bootstrap($VERSION);
19
20
# Constructor.  Blesses an empty hash into $class and passes every
# remaining argument on to init(); whatever init() returns (the
# initialised parser object) is handed back to the caller.
sub new
{
    my $class = shift;
    return bless({}, $class)->init(@_);
}
27
28
# Initialise a parser object from constructor-style key/value arguments.
#
# Recognised keys:
#   api_version  - defaults to 3 when any argument is given, otherwise 2.
#                  Values >= 4 croak.  Values < 3 install the method
#                  callbacks used by the HTML-Parser-2.xx interface.
#   handlers     - hash ref, or array ref of key/value pairs, mapping an
#                  event name to an array ref of handler() arguments.
#   <event>_h    - array ref of handler() arguments for <event>.
#   anything else is invoked as a method on the object with the value as
#   its single argument; the bare event names text, start, end, process,
#   declaration and comment are rejected with a croak.
#
# Returns the parser object.
sub init
{
    my $self = shift;
    $self->_alloc_pstate;

    my %arg = @_;
    my $api_version = delete $arg{api_version} || (@_ ? 3 : 2);
    if ($api_version >= 4) {
        require Carp;
        Carp::croak("API version $api_version not supported " .
                    "by HTML::Parser $VERSION");
    }

    if ($api_version < 3) {
        # Set up method callbacks compatible with HTML-Parser-2.xx
        $self->handler(text    => "text",    "self,text,is_cdata");
        $self->handler(end     => "end",     "self,tagname,text");
        $self->handler(process => "process", "self,token0,text");
        $self->handler(start   => "start",
                       "self,tagname,attr,attrseq,text");

        # One comment() call is made per token of the comment event.
        $self->handler(comment =>
                       sub {
                           my($p, $tokens) = @_;
                           $p->comment($_) for @$tokens;
                       }, "self,tokens");

        # declaration() receives the event text with its first two
        # characters and its last character removed.
        $self->handler(declaration =>
                       sub {
                           my $p = shift;
                           $p->declaration(substr($_[0], 2, -1));
                       }, "self,text");
    }

    if (my $h = delete $arg{handlers}) {
        # An array ref of pairs is accepted as well as a hash ref.
        $h = {@$h} if ref($h) eq "ARRAY";
        while (my($event, $cb) = each %$h) {
            $self->handler($event => @$cb);
        }
    }

    # In the end we try to assume plain attribute or handler
    while (my($option, $val) = each %arg) {
        if ($option =~ /^(\w+)_h$/) {
            $self->handler($1 => @$val);
        }
        elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) {
            require Carp;
            Carp::croak("Bad constructor option '$option'");
        }
        else {
            $self->$option($val);
        }
    }

    return $self;
}
88
89
# Feed the contents of a file to the parser in 512-byte chunks.
#
# $file is either a filename (a plain, non-glob scalar) or an already
# open file handle (a glob or a reference).  A filename is opened
# read-only in binary mode and closed again before returning; a handle
# supplied by the caller is read from but never closed here.
#
# Reading stops early when $self->parse returns a false value.
# $self->eof is always called at the end and its return value is the
# return value of this method.  If a filename cannot be opened, undef is
# returned and $! holds the reason.
sub parse_file
{
    my($self, $file) = @_;
    my $opened;
    if (!ref($file) && ref(\$file) ne "GLOB") {
        # Assume $file is a filename.  The three-argument form of open
        # takes the name literally, so leading/trailing whitespace and
        # characters such as '<', '>' or '|' in a caller-supplied name
        # can never be interpreted as an open mode or a command, and
        # "-" is no longer treated as an alias for STDIN.
        open(my $fh, "<", $file) || return undef;
        binmode($fh);   # should we? good for byte counts
        $opened = 1;
        $file = $fh;
    }
    my $chunk = '';
    while (read($file, $chunk, 512)) {
        $self->parse($chunk) || last;
    }
    close($file) if $opened;
    return $self->eof;
}
109
110
# Legacy accessor kept for old callers; it is the logical inverse of
# strict_comment().  Every call emits a deprecation warning through
# Carp::carp.  Returns the negation of the current strict_comment value
# and, when an argument is given, sets strict_comment to the negation of
# that argument.
sub netscape_buggy_comment  # legacy
{
    my($self, @new_value) = @_;
    require Carp;
    Carp::carp("netscape_buggy_comment() is deprecated. " .
               "Please use the strict_comment() method instead");
    my $old = !$self->strict_comment;
    $self->strict_comment(!$new_value[0]) if @new_value;
    return $old;
}
121
# Set up method stubs.  text() does nothing, and the other version 2
# callback names are aliases for the very same subroutine, so all of
# these methods exist and are no-ops until a subclass overrides them.
sub text { }
*start       = \&text;
*end         = \&text;
*comment     = \&text;
*declaration = \&text;
*process     = \&text;
129
1301;
131
132__END__
133
134
135=head1 NAME
136
137HTML::Parser - HTML parser class
138
139=head1 SYNOPSIS
140
141 use HTML::Parser ();
142
143 # Create parser object
144 $p = HTML::Parser->new( api_version => 3,
145 start_h => [\&start, "tagname, attr"],
146 end_h => [\&end, "tagname"],
147 marked_sections => 1,
148 );
149
150 # Parse document text chunk by chunk
151 $p->parse($chunk1);
152 $p->parse($chunk2);
153 #...
154 $p->eof; # signal end of document
155
156 # Parse directly from file
157 $p->parse_file("foo.html");
158 # or
159 open(F, "foo.html") || die;
160 $p->parse_file(*F);
161
162HTML::Parser version 2 style subclassing and method callbacks:
163
164 {
165 package MyParser;
166 use base 'HTML::Parser';
167
168 sub start {
169 my($self, $tagname, $attr, $attrseq, $origtext) = @_;
170 #...
171 }
172
173 sub end {
174 my($self, $tagname, $origtext) = @_;
175 #...
176 }
177
178 sub text {
179 my($self, $origtext, $is_cdata) = @_;
180 #...
181 }
182 }
183
184 my $p = MyParser->new;
185 $p->parse_file("foo.html");
186
187=head1 DESCRIPTION
188
189Objects of the C<HTML::Parser> class will recognize markup and
190separate it from plain text (alias data content) in HTML
191documents. As different kinds of markup and text are recognized, the
192corresponding event handlers are invoked.
193
194C<HTML::Parser> is not a generic SGML parser. We have tried to
195make it able to deal with the HTML that is actually "out there", and
196it normally parses as closely as possible to the way the popular web
197browsers do it instead of strictly following one of the many HTML
198specifications from W3C. Where there is disagreement there is often
199an option that you can enable to get the official behaviour.
200
201The document to be parsed may be supplied in arbitrary chunks. This
202makes on-the-fly parsing as documents are received from the network
203possible.
204
205If event driven parsing does not feel right for your application, you
206might want to use C<HTML::PullParser>. It is a
207C<HTML::Parser> subclass that allows a more conventional program
208structure.
209
210
211=head1 METHODS
212
213The following method is used to construct a new C<HTML::Parser> object:
214
215=over
216
217=item $p = HTML::Parser->new( %options_and_handlers )
218
219This class method creates a new C<HTML::Parser> object and
220returns it. Key/value pair arguments may be provided to assign event
221handlers or initialize parser options. The handlers and parser
222options can also be set or modified later by method calls described below.
223
224If a top level key is in the form "<event>_h" (e.g., "text_h") then it
225assigns a handler to that event, otherwise it initializes a parser
226option. The event handler specification value must be an array
227reference. Multiple handlers may also be assigned with the 'handlers
228=> [%handlers]' option. See examples below.
229
230If new() is called without any arguments, it will create a parser that
231uses callback methods compatible with version 2 of C<HTML::Parser>.
232See the section on "version 2 compatibility" below for details.
233
234Special constructor option 'api_version => 2' can be used to
235initialize version 2 callbacks while still setting other options and
236handlers. The 'api_version => 3' option can be used if you don't want
237to set any options and don't want to fall back to v2 compatible
238mode.
239
240Examples:
241
242 $p = HTML::Parser->new(api_version => 3,
243 text_h => [ sub {...}, "dtext" ]);
244
245This creates a new parser object with a text event handler subroutine
246that receives the original text with general entities decoded.
247
248 $p = HTML::Parser->new(api_version => 3,
249 start_h => [ 'my_start', "self,tokens" ]);
250
251This creates a new parser object with a start event handler method
252that receives the $p and the tokens array.
253
254 $p = HTML::Parser->new(api_version => 3,
255 handlers => { text => [\@array, "event,text"],
256 comment => [\@array, "event,text"],
257 });
258
259This creates a new parser object that stores the event type and the
260original text in @array for text and comment events.
261
262=back
263
264The following methods feed the HTML document
265to the C<HTML::Parser> object:
266
267=over
268
269=item $p->parse( $string )
270
271Parse $string as the next chunk of the HTML document. The return
272value is normally a reference to the parser object (i.e. $p).
273Handlers invoked should not attempt to modify the $string in-place until
274$p->parse returns.
275
276If an invoked event handler aborts parsing by calling $p->eof, then
277$p->parse() will return a FALSE value.
278
279=item $p->parse( $code_ref )
280
281If a code reference is passed in as the argument to parse then the
282chunks to parse are obtained by invoking this function repeatedly.
283Parsing continues until the function returns an empty (or undefined)
284result. When this happens $p->eof is automatically signalled.
285
286Parsing will also abort if one of the event handlers calls $p->eof.
287
288The effect of this is the same as:
289
290 while (1) {
291 my $chunk = &$code_ref();
292 if (!defined($chunk) || !length($chunk)) {
293 $p->eof;
294 return $p;
295 }
296 $p->parse($chunk) || return undef;
297 }
298
299But it is more efficient as this loop runs internally in XS code.
300
301=item $p->parse_file( $file )
302
303Parse text directly from a file. The $file argument can be a
304filename, an open file handle, or a reference to an open file
305handle.
306
307If $file contains a filename and the file can't be opened, then the
308method returns an undefined value and $! tells why it failed.
309Otherwise the return value is a reference to the parser object.
310
311If a file handle is passed as the $file argument, then the file will
312normally be read until EOF, but not closed.
313
314If an invoked event handler aborts parsing by calling $p->eof,
315then $p->parse_file() may not have read the entire file.
316
317On systems with multi-byte line terminators, the values passed for the
318offset and length argspecs may be too low if parse_file() is called on
319a file handle that is not in binary mode.
320
321If a filename is passed in, then parse_file() will open the file in
322binary mode.
323
324=item $p->eof
325
326Signals the end of the HTML document. Calling the $p->eof method
327outside a handler callback will flush any remaining buffered text
328(which triggers the C<text> event if there is any remaining text).
329
330Calling $p->eof inside a handler will terminate parsing at that point
331and cause $p->parse to return a FALSE value. This also terminates
332parsing by $p->parse_file().
333
334After $p->eof has been called, the parse() and parse_file() methods
335can be invoked to feed new documents with the parser object.
336
337The return value from eof() is a reference to the parser object.
338
339=back
340
341
342Most parser options are controlled by boolean attributes.
343Each boolean attribute is enabled by calling the corresponding method
344with a TRUE argument and disabled with a FALSE argument. The
345attribute value is left unchanged if no argument is given. The return
346value from each method is the old attribute value.
347
348Methods that can be used to get and/or set parser options are:
349
350=over
351
352=item $p->strict_comment( [$bool] )
353
354By default, comments are terminated by the first occurrence of "-->".
355This is the behaviour of most popular browsers (like Netscape and
356MSIE), but it is not correct according to the official HTML
357standard. Officially, you need an even number of "--" tokens before
358the closing ">" is recognized and there may not be anything but
359whitespace between an even and an odd "--".
360
361The official behaviour is enabled by enabling this attribute.
362
363=item $p->strict_names( [$bool] )
364
365By default, almost anything is allowed in tag and attribute names.
366This is the behaviour of most popular browsers and allows us to parse
367some broken tags with invalid attr values like:
368
369 <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0>
370
371By default, "LIST]" is parsed as a boolean attribute, not as
372part of the ALT value as was clearly intended. This is also what
373Netscape sees.
374
375The official behaviour is enabled by enabling this attribute. If
376enabled, it will cause the tag above to be reported as text
377since "LIST]" is not a legal attribute name.
378
379=item $p->boolean_attribute_value( $val )
380
381This method sets the value reported for boolean attributes inside HTML
382start tags. By default, the name of the attribute is also used as its
383value. This affects the values reported for C<tokens> and C<attr>
384argspecs.
385
386=item $p->xml_mode( [$bool] )
387
388Enabling this attribute changes the parser to allow some XML
389constructs such as I<empty element tags> and I<XML processing
390instructions>. It disables forcing tag and attribute names to lower
391case when they are reported by the C<tagname> and C<attr> argspecs,
392and suppresses special treatment of elements that are parsed as CDATA
393for HTML.
394
395I<Empty element tags> look like start tags, but end with the character
396sequence "/>". When recognized by C<HTML::Parser> they cause an
397artificial end event in addition to the start event. The C<text> for
398the artificial end event will be empty and the C<tokenpos> array will
399be undefined even though the only element in the token array will have
400the correct tag name.
401
402I<XML processing instructions> are terminated by "?>" instead of a
403simple ">" as is the case for HTML.
404
405=item $p->unbroken_text( [$bool] )
406
407By default, blocks of text are given to the text handler as soon as
408possible (but the parser makes sure to always break text at the
409boundary between whitespace and non-whitespace so single words and
410entities always can be decoded safely). This might create breaks that
411make it hard to do transformations on the text. When this attribute is
412enabled, blocks of text are always reported in one piece. This will
413delay the text event until the following (non-text) event has been
414recognized by the parser.
415
416Note that the C<offset> argspec will give you the offset of the first
417segment of text and C<length> is the combined length of the segments.
418Since there might be ignored tags in between, these numbers can't be
419used to directly index in the original document file.
420
421=item $p->marked_sections( [$bool] )
422
423By default, section markings like <![CDATA[...]]> are treated like
424ordinary text. When this attribute is enabled section markings are
425honoured.
426
427There are currently no events associated with the marked section
428markup, but the text can be returned as C<skipped_text>.
429
430=item $p->attr_encoded( [$bool] )
431
432By default, the C<attr> and C<@attr> argspecs will have general
433entities for attribute values decoded. Enabling this attribute leaves
434entities alone.
435
436=item $p->case_sensitive( [$bool] )
437
438By default, tagnames and attribute names are down-cased. Enabling this
439attribute leaves them as found in the HTML source document.
440
441=back
442
443As markup and text is recognized, handlers are invoked. The following
444method is used to set up handlers for different events:
445
446=over
447
448=item $p->handler( event => \&subroutine, argspec )
449
450=item $p->handler( event => method_name, argspec )
451
452=item $p->handler( event => \@accum, argspec )
453
454=item $p->handler( event => "" );
455
456=item $p->handler( event => undef );
457
458=item $p->handler( event );
459
460This method assigns a subroutine, method, or array to handle an event.
461
462Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>,
463C<process>, C<start_document>, C<end_document> or C<default>.
464
465I<Subroutine> is a reference to a subroutine which is called to handle
466the event.
467
468I<Method_name> is the name of a method of $p which is called to handle
469the event.
470
471I<Accum> is an array that will hold the event information as
472sub-arrays.
473
474If the second argument is "", the event is ignored.
475If it is undef, the default handler is invoked for the event.
476
477I<Argspec> is a string that describes the information to be reported
478for the event. Any requested information that does not apply to a
479specific event is passed as C<undef>. If argspec is omitted, then it
480is left unchanged since last update.
481
482The return value from $p->handler is the old callback routine or a
483reference to the accumulator array.
484
485Any return values from handler callback routines/methods are always
486ignored. A handler callback can request parsing to be aborted by
487invoking the $p->eof method. A handler callback is not allowed to
488invoke the $p->parse() or $p->parse_file() method. An exception will
489be raised if it tries.
490
491Examples:
492
493 $p->handler(start => "start", 'self, attr, attrseq, text' );
494
495This causes the "start" method of object $p to be called for 'start' events.
496The callback signature is $p->start(\%attr, \@attr_seq, $text).
497
498 $p->handler(start => \&start, 'attr, attrseq, text' );
499
500This causes subroutine start() to be called for 'start' events.
501The callback signature is start(\%attr, \@attr_seq, $text).
502
503 $p->handler(start => \@accum, '"S", attr, attrseq, text' );
504
505This causes 'start' event information to be saved in @accum.
506The array elements will be ['S', \%attr, \@attr_seq, $text].
507
508 $p->handler(start => "");
509
510This causes 'start' events to be ignored. It also suppresses
511invocations of any default handler for start events. It is in most
512cases equivalent to $p->handler(start => sub {}), but is more
513efficient. It is different from the empty-sub-handler in that
514C<skipped_text> is not reset by it.
515
516 $p->handler(start => undef);
517
518This causes no handler to be associated with start events.
519If there is a default handler it will be invoked.
520
521=back
522
523Filters based on tags can be set up to limit the number of events
524reported. The main bottleneck during parsing is often the huge number
525of callbacks made from the parser. Applying filters can improve
526performance significantly.
527
528The following methods control filters:
529
530=over
531
532=item $p->ignore_tags( TAG, ... )
533
534Any C<start> and C<end> events involving any of the tags given are
535suppressed.
536
537=item $p->report_tags( TAG, ... )
538
539Any C<start> and C<end> events involving any of the tags I<not> given
540are suppressed.
541
542=item $p->ignore_elements( TAG, ... )
543
544Both the C<start> and the C<end> event as well as any events that
545would be reported in between are suppressed. The ignored elements can
546contain nested occurrences of themselves. Example:
547
548 $p->ignore_elements(qw(script style));
549
550The C<script> and C<style> tags will always nest properly since their
551content is parsed in CDATA mode. For most other tags
552C<ignore_elements> must be used with caution since HTML is often not
553I<well formed>.
554
555=back
556
557=head2 Argspec
558
559Argspec is a string containing a comma separated list that describes
560the information reported by the event. The following argspec
561identifier names can be used:
562
563=over
564
565=item C<self>
566
567Self causes the current object to be passed to the handler. If the
568handler is a method, this must be the first element in the argspec.
569
570An alternative to passing self as an argspec is to register closures
571that capture $self by themselves as handlers. Unfortunately this
572creates a circular reference which prevents the HTML::Parser object
573from being garbage collected. Using the C<self> argspec avoids this
574problem.
575
576=item C<tokens>
577
578Tokens causes a reference to an array of token strings to be passed.
579The strings are exactly as they were found in the original text,
580no decoding or case changes are applied.
581
582For C<declaration> events, the array contains each word, comment, and
583delimited string starting with the declaration type.
584
585For C<comment> events, this contains each sub-comment. If
586$p->strict_comment is disabled, there will be only one sub-comment.
587
588For C<start> events, this contains the original tag name followed by
589the attribute name/value pairs. The value of boolean attributes will
590be either the value set by $p->boolean_attribute_value or the
591attribute name if no value has been set by
592$p->boolean_attribute_value.
593
594For C<end> events, this contains the original tag name (always one token).
595
596For C<process> events, this contains the process instructions (always one
597token).
598
599This passes C<undef> for C<text> events.
600
601=item C<tokenpos>
602
603Tokenpos causes a reference to an array of token positions to be
604passed. For each string that appears in C<tokens>, this array
605contains two numbers. The first number is the offset of the start of
606the token in the original C<text> and the second number is the length
607of the token.
608
609Boolean attributes in a C<start> event will have (0,0) for the
610attribute value offset and length.
611
612This passes undef if there are no tokens in the event (e.g., C<text>)
613and for artificial C<end> events triggered by empty element tags.
614
615If you are using these offsets and lengths to modify C<text>, you
616should either work from right to left, or be very careful to calculate
617the changes to the offsets.
618
619=item C<token0>
620
621Token0 causes the original text of the first token string to be
622passed. This should always be the same as $tokens->[0].
623
624For C<declaration> events, this is the declaration type.
625
626For C<start> and C<end> events, this is the tag name.
627
628For C<process> and non-strict C<comment> events, this is everything
629inside the tag.
630
631This passes undef if there are no tokens in the event.
632
633=item C<tagname>
634
635This is the element name (or I<generic identifier> in SGML jargon) for
636start and end tags. Since HTML is case insensitive this name is
637forced to lower case to ease string matching.
638
639Since XML is case sensitive, the tagname case is not changed when
640C<xml_mode> is enabled. Same happens if the C<case_sensitive> attribute
641is set.
642
643The declaration type of declaration elements is also passed as a tagname,
644even if that is a bit strange.
645In fact, in the current implementation tagname is
646identical to C<token0> except that the name may be forced to lower case.
647
648=item C<tag>
649
650Same as C<tagname>, but prefixed with "/" if it belongs to an C<end>
651event and "!" for a declaration. The C<tag> does not have any prefix
652for C<start> events, and is in this case identical to C<tagname>.
653
654=item C<attr>
655
656Attr causes a reference to a hash of attribute name/value pairs to be
657passed.
658
659Boolean attributes' values are either the value set by
660$p->boolean_attribute_value or the attribute name if no value has been
661set by $p->boolean_attribute_value.
662
663This passes undef except for C<start> events.
664
665Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
666names are forced to lower case.
667
668General entities are decoded in the attribute values and
669one layer of matching quotes enclosing the attribute values are removed.
670
671=item C<attrseq>
672
673Attrseq causes a reference to an array of attribute names to be
674passed. This can be useful if you want to walk the C<attr> hash in
675the original sequence.
676
677This passes undef except for C<start> events.
678
679Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
680names are forced to lower case.
681
682=item C<@attr>
683
684Basically same as C<attr>, but keys and values are passed as
685individual arguments and the original sequence of the attributes is
686kept. The parameters passed will be the same as the @attr calculated
687here:
688
689 @attr = map { $_ => $attr->{$_} } @$attrseq;
690
691assuming $attr and $attrseq here are the hash and array passed as the
692result of C<attr> and C<attrseq> argspecs.
693
694This passes no values for events besides C<start>.
695
696=item C<text>
697
698Text causes the source text (including markup element delimiters) to be
699passed.
700
701=item C<dtext>
702
703Dtext causes the decoded text to be passed. General entities are
704automatically decoded unless the event was inside a CDATA section or
705was between literal start and end tags (C<script>, C<style>, C<xmp>,
706and C<plaintext>).
707
708The Unicode character set is assumed for entity decoding. With perl
709version < 5.7.1 only the Latin1 range is supported, and entities for
710characters outside the 0..255 range are left unchanged.
711
712This passes undef except for C<text> events.
713
714=item C<is_cdata>
715
716Is_cdata causes a TRUE value to be passed if the event is inside a CDATA
717section or is between literal start and end tags (C<script>,
718C<style>, C<xmp>, and C<plaintext>).
719
720When the flag is FALSE for a text event, then you should normally
721either use C<dtext> or decode the entities yourself before the text is
722processed further.
723
724=item C<skipped_text>
725
726Skipped_text returns the concatenated text of all the events that have
727been skipped since the last time an event was reported. Events might
728be skipped because no handler is registered for them or because some
729filter applies. Skipped text also includes marked section markup,
730since there are no events that can catch them.
731
732If an C<"">-handler is registered for an event, then the text for this
733event is not included in C<skipped_text>. Skipped text both before
734and after the C<"">-event is included in the next reported
735C<skipped_text>.
736
737=item C<offset>
738
739Offset causes the byte position in the HTML document of the start of
740the event to be passed. The first byte in the document is 0.
741
742=item C<length>
743
744Length causes the number of bytes of the source text of the event to
745be passed.
746
747=item C<offset_end>
748
749Offset_end causes the byte position in the HTML document of the end of
750the event to be passed. This is the same as C<offset> + C<length>.
751
752=item C<event>
753
754Event causes the event name to be passed.
755
756The event name is one of C<text>, C<start>, C<end>, C<declaration>,
757C<comment>, C<process>, C<start_document>, C<end_document> or C<default>.
758
759=item C<line>
760
761Line causes the line number of the start of the event to be passed.
762The first line in the document is 1. Line counting doesn't start
763until at least one handler requests this value to be reported.
764
765=item C<column>
766
767Column causes the column number of the start of the event to be passed.
768The first column on a line is 0.
769
770=item C<'...'>
771
772A literal string of 0 to 255 characters enclosed
773in single (') or double (") quotes is passed as entered.
774
775=item C<undef>
776
777Pass an undefined value. Useful as padding where the same handler
778routine is registered for multiple events.
779
780=back
781
782The whole argspec string can be wrapped up in C<'@{...}'> to signal
783that resulting event array should be flatten. This only makes a
784difference if an array reference is used as the handler target.
785Consider this example:
786
787 $p->handler(text => [], 'text');
788 $p->handler(text => [], '@{text}');
789
790With two text events; C<"foo">, C<"bar">; then the first one will end
791up with [["foo"], ["bar"]] and the second one with ["foo", "bar"] in
792the handler target array.
793
794
795=head2 Events
796
797Handlers for the following events can be registered:
798
799=over
800
801=item C<text>
802
803This event is triggered when plain text (characters) is recognized.
804The text may contain multiple lines. A sequence of text may be broken
805between several text events unless $p->unbroken_text is enabled.
806
807The parser will make sure that it does not break a word or a sequence
808of whitespace between two text events.
809
810=item C<start>
811
812This event is triggered when a start tag is recognized.
813
814Example:
815
816 <A HREF="http://www.perl.com/">
817
818=item C<end>
819
820This event is triggered when an end tag is recognized.
821
822Example:
823
824 </A>
825
826=item C<declaration>
827
828This event is triggered when a I<markup declaration> is recognized.
829
830For typical HTML documents, the only declaration you are
831likely to find is <!DOCTYPE ...>.
832
833Example:
834
835 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
836 "http://www.w3.org/TR/html40/strict.dtd">
837
838DTDs inside <!DOCTYPE ...> will confuse HTML::Parser.
839
840=item C<comment>
841
842This event is triggered when a markup comment is recognized.
843
844Example:
845
846 <!-- This is a comment -- -- So is this -->
847
848=item C<process>
849
850This event is triggered when processing instruction markup is
851recognized.
852
853The format and content of processing instructions is system and
854application dependent.
855
856Examples:
857
858 <? HTML processing instructions >
859 <? XML processing instructions ?>
860
861=item C<start_document>
862
863This event is triggered before any other events for a new document. A
864handler for it can be used to initialize stuff. There is no document
865text associated with this event.
866
867=item C<end_document>
868
869This event is triggered when $p->eof is called and after any remaining
870text is flushed. There is no document text associated with this event.
871
872=item C<default>
873
874This event is triggered for events that do not have a specific
875handler. You can set up a handler for this event to catch stuff you
876did not want to catch explicitly.
877
878=back
879
880=head1 VERSION 2 COMPATIBILITY
881
882When an C<HTML::Parser> object is constructed with no arguments, a set
883of handlers is automatically provided that is compatible with the old
884HTML::Parser version 2 callback methods.
885
886This is equivalent to the following method calls:
887
888 $p->handler(start => "start", "self, tagname, attr, attrseq, text");
889 $p->handler(end => "end", "self, tagname, text");
890 $p->handler(text => "text", "self, text, is_cdata");
891 $p->handler(process => "process", "self, token0, text");
892 $p->handler(comment =>
893 sub {
894 my($self, $tokens) = @_;
895 for (@$tokens) {$self->comment($_);}},
896 "self, tokens");
897 $p->handler(declaration =>
898 sub {
899 my $self = shift;
900 $self->declaration(substr($_[0], 2, -1));},
901 "self, text");
902
903Setup of these handlers can also be requested with the "api_version =>
9042" constructor option.
905
906=head1 SUBCLASSING
907
908The C<HTML::Parser> class is subclassable. Parser objects are plain
909hashes and C<HTML::Parser> reserves only hash keys that start with
910"_hparser". The parser state can be set up by invoking the init()
911method which takes the same arguments as new().
912
913=head1 EXAMPLES
914
915The first simple example shows how you might strip out comments from
916an HTML document. We achieve this by setting up a comment handler that
917does nothing and a default handler that will print out anything else:
918
919 use HTML::Parser;
920 HTML::Parser->new(default_h => [sub { print shift }, 'text'],
921 comment_h => [""],
922 )->parse_file(shift || die) || die $!;
923
924An alternative implementation is:
925
926 use HTML::Parser;
927 HTML::Parser->new(end_document_h => [sub { print shift },
928 'skipped_text'],
929 comment_h => [""],
930 )->parse_file(shift || die) || die $!;
931
932This will in most cases be much more efficient since only a single
933callback will be made.
934
935The next example prints out the text that is inside the <title>
936element of an HTML document. Here we start by setting up a start
937handler. When it sees the title start tag it enables a text handler
938that prints any text found and an end handler that will terminate
939parsing as soon as the title end tag is seen:
940
941 use HTML::Parser ();
942
943 sub start_handler
944 {
945 return if shift ne "title";
946 my $self = shift;
947 $self->handler(text => sub { print shift }, "dtext");
948 $self->handler(end => sub { shift->eof if shift eq "title"; },
949 "tagname,self");
950 }
951
952 my $p = HTML::Parser->new(api_version => 3);
953 $p->handler( start => \&start_handler, "tagname,self");
954 $p->parse_file(shift || die) || die $!;
955 print "\n";
956
957More examples are found in the "eg/" directory of the C<HTML-Parser>
958distribution; the program C<hrefsub> shows how you can edit all links
959found in a document and C<htextsub> how to edit the text only; the
960program C<hstrip> shows how you can strip out certain tags/elements
961and/or attributes; and the program C<htext> shows how to obtain the
962plain text, but not any script/style content.
963
964=head1 BUGS
965
966The <style> and <script> sections do not end with the first "</", but
967need the complete corresponding end tag.
968
969When the I<strict_comment> option is enabled, we still recognize
970comments where there is something other than whitespace between even
971and odd "--" markers.
972
973Once $p->boolean_attribute_value has been set, there is no way to
974restore the default behaviour.
975
976There is currently no way to get both quote characters
977into the same literal argspec.
978
979Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them
980to repeat the previous start tag or close the previous start tag
981respectively.
982
983NET tags, e.g. "<code/.../" are not recognized. This is an SGML
984shorthand for "<code>...</code>".
985
986Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not
987recognized.
988
989=head1 DIAGNOSTICS
990
991The following messages may be produced by HTML::Parser. The notation
992in this listing is the same as used in L<perldiag>:
993
994=over
995
996=item Not a reference to a hash
997
998(F) The object blessed into or subclassed from HTML::Parser is not a
999hash as required by the HTML::Parser methods.
1000
1001=item Bad signature in parser state object at %p
1002
1003(F) The _hparser_xs_state element does not refer to a valid state structure.
1004Something must have changed the internal value
1005stored in this hash element, or the memory has been overwritten.
1006
1007=item _hparser_xs_state element is not a reference
1008
1009(F) The _hparser_xs_state element has been destroyed.
1010
1011=item Can't find '_hparser_xs_state' element in HTML::Parser hash
1012
1013(F) The _hparser_xs_state element is missing from the parser hash.
1014It was either deleted, or not created when the object was created.
1015
1016=item API version %s not supported by HTML::Parser %s
1017
1018(F) The constructor option 'api_version' with an argument greater than
1019or equal to 4 is reserved for future extensions.
1020
1021=item Bad constructor option '%s'
1022
1023(F) An unknown constructor option key was passed to the new() or
1024init() methods.
1025
1026=item Parse loop not allowed
1027
1028(F) A handler invoked the parse() or parse_file() method.
1029This is not permitted.
1030
1031=item marked sections not supported
1032
1033(F) The $p->marked_sections() method was invoked in a HTML::Parser
1034module that was compiled without support for marked sections.
1035
1036=item Unknown boolean attribute (%d)
1037
1038(F) Something is wrong with the internal logic that set up aliases for
1039boolean attributes.
1040
1041=item Only code or array references allowed as handler
1042
1043(F) The second argument for $p->handler must be either a subroutine
1044reference, the name of a subroutine or method, or a reference to an
1045array.
1046
1047=item No handler for %s events
1048
1049(F) The first argument to $p->handler must be a valid event name; i.e. one of
1050"start", "end", "text", "process", "declaration", "comment", "start_document", "end_document" or "default".
1051
1052=item Unrecognized identifier %s in argspec
1053
1054(F) The identifier is not a known argspec name.
1055Use one of the names mentioned in the argspec section above.
1056
1057=item Literal string is longer than 255 chars in argspec
1058
1059(F) The current implementation limits the length of literals in
1060an argspec to 255 characters. Make the literal shorter.
1061
1062=item Backslash reserved for literal string in argspec
1063
1064(F) The backslash character "\" is not allowed in argspec literals.
1065It is reserved to permit quoting inside a literal in a later version.
1066
1067=item Unterminated literal string in argspec
1068
1069(F) The terminating quote character for a literal was not found.
1070
1071=item Bad argspec (%s)
1072
1073(F) Only identifier names, literals, spaces and commas
1074are allowed in argspecs.
1075
1076=item Missing comma separator in argspec
1077
1078(F) Identifiers in an argspec must be separated with ",".
1079
1080=back
1081
1082=head1 SEE ALSO
1083
1084L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>,
1085L<HTML::LinkExtor>, L<HTML::Form>
1086
1087L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution)
1088
1089http://www.w3.org/TR/REC-html40
1090
1091More information about marked sections and processing instructions may
1092be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>.
1093
1094=head1 COPYRIGHT
1095
1096 Copyright 1996-2003 Gisle Aas. All rights reserved.
1097 Copyright 1999-2000 Michael A. Chase. All rights reserved.
1098
1099This library is free software; you can redistribute it and/or
1100modify it under the same terms as Perl itself.
1101
1102=cut