Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package HTML::Parser; |
2 | ||
3 | # Copyright 1996-2003, Gisle Aas. | |
4 | # Copyright 1999-2000, Michael A. Chase. | |
5 | # | |
6 | # This library is free software; you can redistribute it and/or | |
7 | # modify it under the same terms as Perl itself. | |
8 | ||
9 | use strict; | |
10 | use vars qw($VERSION @ISA); | |
11 | ||
12 | $VERSION = '3.28'; # $Date: 2003/04/17 03:45:34 $ | |
13 | ||
14 | require HTML::Entities; | |
15 | ||
16 | require DynaLoader; | |
17 | @ISA=qw(DynaLoader); | |
18 | HTML::Parser->bootstrap($VERSION); | |
19 | ||
20 | ||
# Constructor: blesses an empty hash into $class and passes every remaining
# argument on to init().  The value returned to the caller is whatever
# init() returns.
sub new
{
    my($class, @options) = @_;
    return bless({}, $class)->init(@options);
}
27 | ||
28 | ||
# Set up a freshly blessed parser object from the constructor arguments.
#
# Arguments are key/value pairs:
#   api_version => N     2 installs the version 2 style method callbacks,
#                        3 installs none; 4 or higher croaks.  When the
#                        key is absent (or false), a call without any
#                        arguments means 2 and any other call means 3.
#   handlers => \%h      (or an array reference of key/value pairs)
#                        maps event names to array references whose
#                        elements are passed on to $self->handler.
#   <event>_h => \@spec  same as $self->handler(<event> => @spec).
#   <option> => $value   anything else is invoked as $self->$option($value).
#
# A bare event name (text, start, end, process, declaration, comment) used
# as an option croaks with "Bad constructor option".
#
# Returns $self.
sub init
{
    my $self = shift;
    $self->_alloc_pstate;

    my %arg = @_;
    my $api_version = delete $arg{api_version} || (@_ ? 3 : 2);
    if ($api_version >= 4) {
        require Carp;
        Carp::croak("API version $api_version not supported " .
                    "by HTML::Parser $VERSION");
    }

    if ($api_version < 3) {
        # Set up method callbacks compatible with HTML-Parser-2.xx
        $self->handler(text    => "text",    "self,text,is_cdata");
        $self->handler(end     => "end",     "self,tagname,text");
        $self->handler(process => "process", "self,token0,text");
        $self->handler(start   => "start",
                                  "self,tagname,attr,attrseq,text");

        # Call the comment method once for each entry in the tokens array.
        $self->handler(comment =>
                       sub {
                           my($self, $tokens) = @_;
                           for (@$tokens) {
                               $self->comment($_);
                           }
                       }, "self,tokens");

        # Pass the declaration text without its first two characters and
        # its last character.
        $self->handler(declaration =>
                       sub {
                           my $self = shift;
                           $self->declaration(substr($_[0], 2, -1));
                       }, "self,text");
    }

    if (my $handlers = delete $arg{handlers}) {
        $handlers = {@$handlers} if ref($handlers) eq "ARRAY";
        # Iterate over keys() rather than each(): the hash may belong to
        # the caller, and each() would continue from wherever its iterator
        # was last left, silently skipping entries.  keys() always starts
        # from the beginning.
        for my $event (keys %$handlers) {
            $self->handler($event => @{ $handlers->{$event} });
        }
    }

    # In the end we try to assume plain attribute or handler
    for my $option (keys %arg) {
        my $val = $arg{$option};
        if ($option =~ /^(\w+)_h$/) {
            my $event = $1;     # copy before any further code can clobber $1
            $self->handler($event => @$val);
        }
        elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) {
            require Carp;
            Carp::croak("Bad constructor option '$option'");
        }
        else {
            $self->$option($val);
        }
    }

    return $self;
}
88 | ||
89 | ||
# Feed the contents of $file to the parser and then signal end of document.
#
# $file is treated as a filename when it is neither a reference nor a
# glob; anything else is used directly as a handle to read from.  A file
# opened here is put in binary mode and is closed again before returning;
# a handle supplied by the caller is left open.
#
# Returns undef if a filename can't be opened ($! tells why), otherwise
# the value of $self->eof.  Reading stops early if $self->parse returns a
# false value.
sub parse_file
{
    my($self, $file) = @_;
    my $opened;
    if (!ref($file) && ref(\$file) ne "GLOB") {
        # Assume $file is a filename.  Use the three-argument form of open
        # so that characters such as ">", "|" or surrounding whitespace in
        # the name are taken literally and never interpreted as an open
        # mode or a command to run.
        open(my $fh, "<", $file) || return undef;
        binmode($fh);  # should we? good for byte counts
        $opened++;
        $file = $fh;
    }
    my $chunk = '';
    while (read($file, $chunk, 512)) {
        $self->parse($chunk) || last;
    }
    close($file) if $opened;
    return $self->eof;
}
109 | ||
110 | ||
# Deprecated accessor expressed in terms of strict_comment().  Every call
# emits a deprecation warning through Carp::carp.
#
# Returns the logical negation of the current strict_comment value.  If an
# argument is given, strict_comment is then set to the negation of that
# argument.
sub netscape_buggy_comment  # legacy
{
    my($self, @new_value) = @_;

    require Carp;
    Carp::carp("netscape_buggy_comment() is deprecated. " .
               "Please use the strict_comment() method instead");

    my $previous = !$self->strict_comment;
    if (@new_value) {
        $self->strict_comment(!$new_value[0]);
    }
    return $previous;
}
121 | ||
# Method stubs: text() is an empty method, and start, end, comment,
# declaration and process are all installed as aliases of that same
# subroutine.
sub text { }

{
    no strict 'refs';
    for my $event (qw(start end comment declaration process)) {
        *{__PACKAGE__ . "::" . $event} = \&text;
    }
}
129 | ||
130 | 1; | |
131 | ||
132 | __END__ | |
133 | ||
134 | ||
135 | =head1 NAME | |
136 | ||
137 | HTML::Parser - HTML parser class | |
138 | ||
139 | =head1 SYNOPSIS | |
140 | ||
141 | use HTML::Parser (); | |
142 | ||
143 | # Create parser object | |
144 | $p = HTML::Parser->new( api_version => 3, | |
145 | start_h => [\&start, "tagname, attr"], | |
146 | end_h => [\&end, "tagname"], | |
147 | marked_sections => 1, | |
148 | ); | |
149 | ||
150 | # Parse document text chunk by chunk | |
151 | $p->parse($chunk1); | |
152 | $p->parse($chunk2); | |
153 | #... | |
154 | $p->eof; # signal end of document | |
155 | ||
156 | # Parse directly from file | |
157 | $p->parse_file("foo.html"); | |
158 | # or | |
159 | open(F, "foo.html") || die; | |
160 | $p->parse_file(*F); | |
161 | ||
162 | HTML::Parser version 2 style subclassing and method callbacks: | |
163 | ||
164 | { | |
165 | package MyParser; | |
166 | use base 'HTML::Parser'; | |
167 | ||
168 | sub start { | |
169 | my($self, $tagname, $attr, $attrseq, $origtext) = @_; | |
170 | #... | |
171 | } | |
172 | ||
173 | sub end { | |
174 | my($self, $tagname, $origtext) = @_; | |
175 | #... | |
176 | } | |
177 | ||
178 | sub text { | |
179 | my($self, $origtext, $is_cdata) = @_; | |
180 | #... | |
181 | } | |
182 | } | |
183 | ||
184 | my $p = MyParser->new; | |
185 | $p->parse_file("foo.html"); | |
186 | ||
187 | =head1 DESCRIPTION | |
188 | ||
189 | Objects of the C<HTML::Parser> class will recognize markup and | |
190 | separate it from plain text (alias data content) in HTML | |
191 | documents. As different kinds of markup and text are recognized, the | |
192 | corresponding event handlers are invoked. | |
193 | ||
194 | C<HTML::Parser> is not a generic SGML parser. We have tried to | |
195 | make it able to deal with the HTML that is actually "out there", and | |
196 | it normally parses as closely as possible to the way the popular web | |
197 | browsers do it instead of strictly following one of the many HTML | |
198 | specifications from W3C. Where there is disagreement there is often | |
199 | an option that you can enable to get the official behaviour. | |
200 | ||
201 | The document to be parsed may be supplied in arbitrary chunks. This | |
202 | makes on-the-fly parsing as documents are received from the network | |
203 | possible. | |
204 | ||
205 | If event driven parsing does not feel right for your application, you | |
206 | might want to use C<HTML::PullParser>. It is a | |
207 | C<HTML::Parser> subclass that allows a more conventional program | |
208 | structure. | |
209 | ||
210 | ||
211 | =head1 METHODS | |
212 | ||
213 | The following method is used to construct a new C<HTML::Parser> object: | |
214 | ||
215 | =over | |
216 | ||
217 | =item $p = HTML::Parser->new( %options_and_handlers ) | |
218 | ||
219 | This class method creates a new C<HTML::Parser> object and | |
220 | returns it. Key/value pair arguments may be provided to assign event | |
221 | handlers or initialize parser options. The handlers and parser | |
222 | options can also be set or modified later by method calls described below. | |
223 | ||
224 | If a top level key is in the form "<event>_h" (e.g., "text_h") then it | |
225 | assigns a handler to that event, otherwise it initializes a parser | |
226 | option. The event handler specification value must be an array | |
227 | reference. Multiple handlers may also be assigned with the 'handlers | |
228 | => [%handlers]' option. See examples below. | |
229 | ||
230 | If new() is called without any arguments, it will create a parser that | |
231 | uses callback methods compatible with version 2 of C<HTML::Parser>. | |
232 | See the section on "version 2 compatibility" below for details. | |
233 | ||
234 | Special constructor option 'api_version => 2' can be used to | |
235 | initialize version 2 callbacks while still setting other options and | |
236 | handlers. The 'api_version => 3' option can be used if you don't want | |
237 | to set any options and don't want to fall back to v2 compatible | |
238 | mode. | |
239 | ||
240 | Examples: | |
241 | ||
242 | $p = HTML::Parser->new(api_version => 3, | |
243 | text_h => [ sub {...}, "dtext" ]); | |
244 | ||
245 | This creates a new parser object with a text event handler subroutine | |
246 | that receives the original text with general entities decoded. | |
247 | ||
248 | $p = HTML::Parser->new(api_version => 3, | |
249 | start_h => [ 'my_start', "self,tokens" ]); | |
250 | ||
251 | This creates a new parser object with a start event handler method | |
252 | that receives the $p and the tokens array. | |
253 | ||
254 | $p = HTML::Parser->new(api_version => 3, | |
255 | handlers => { text => [\@array, "event,text"], | |
256 | comment => [\@array, "event,text"], | |
257 | }); | |
258 | ||
259 | This creates a new parser object that stores the event type and the | |
260 | original text in @array for text and comment events. | |
261 | ||
262 | =back | |
263 | ||
264 | The following methods feed the HTML document | |
265 | to the C<HTML::Parser> object: | |
266 | ||
267 | =over | |
268 | ||
269 | =item $p->parse( $string ) | |
270 | ||
271 | Parse $string as the next chunk of the HTML document. The return | |
272 | value is normally a reference to the parser object (i.e. $p). | |
273 | Handlers invoked should not attempt to modify the $string in-place until | |
274 | $p->parse returns. | |
275 | ||
276 | If an invoked event handler aborts parsing by calling $p->eof, then | |
277 | $p->parse() will return a FALSE value. | |
278 | ||
279 | =item $p->parse( $code_ref ) | |
280 | ||
281 | If a code reference is passed in as the argument to parse then the | |
282 | chunks to parse are obtained by invoking this function repeatedly. | |
283 | Parsing continues until the function returns an empty (or undefined) | |
284 | result. When this happens $p->eof is automatically signalled. | |
285 | ||
286 | Parsing will also abort if one of the event handlers calls $p->eof. | |
287 | ||
288 | The effect of this is the same as: | |
289 | ||
290 | while (1) { | |
291 | my $chunk = &$code_ref(); | |
292 | if (!defined($chunk) || !length($chunk)) { | |
293 | $p->eof; | |
294 | return $p; | |
295 | } | |
296 | $p->parse($chunk) || return undef; | |
297 | } | |
298 | ||
299 | But it is more efficient as this loop runs internally in XS code. | |
300 | ||
301 | =item $p->parse_file( $file ) | |
302 | ||
303 | Parse text directly from a file. The $file argument can be a | |
304 | filename, an open file handle, or a reference to an open file | |
305 | handle. | |
306 | ||
307 | If $file contains a filename and the file can't be opened, then the | |
308 | method returns an undefined value and $! tells why it failed. | |
309 | Otherwise the return value is a reference to the parser object. | |
310 | ||
311 | If a file handle is passed as the $file argument, then the file will | |
312 | normally be read until EOF, but not closed. | |
313 | ||
314 | If an invoked event handler aborts parsing by calling $p->eof, | |
315 | then $p->parse_file() may not have read the entire file. | |
316 | ||
317 | On systems with multi-byte line terminators, the values passed for the | |
318 | offset and length argspecs may be too low if parse_file() is called on | |
319 | a file handle that is not in binary mode. | |
320 | ||
321 | If a filename is passed in, then parse_file() will open the file in | |
322 | binary mode. | |
323 | ||
324 | =item $p->eof | |
325 | ||
326 | Signals the end of the HTML document. Calling the $p->eof method | |
327 | outside a handler callback will flush any remaining buffered text | |
328 | (which triggers the C<text> event if there is any remaining text). | |
329 | ||
330 | Calling $p->eof inside a handler will terminate parsing at that point | |
331 | and cause $p->parse to return a FALSE value. This also terminates | |
332 | parsing by $p->parse_file(). | |
333 | ||
334 | After $p->eof has been called, the parse() and parse_file() methods | |
335 | can be invoked to feed new documents with the parser object. | |
336 | ||
337 | The return value from eof() is a reference to the parser object. | |
338 | ||
339 | =back | |
340 | ||
341 | ||
342 | Most parser options are controlled by boolean attributes. | |
343 | Each boolean attribute is enabled by calling the corresponding method | |
344 | with a TRUE argument and disabled with a FALSE argument. The | |
345 | attribute value is left unchanged if no argument is given. The return | |
346 | value from each method is the old attribute value. | |
347 | ||
348 | Methods that can be used to get and/or set parser options are: | |
349 | ||
350 | =over | |
351 | ||
352 | =item $p->strict_comment( [$bool] ) | |
353 | ||
354 | By default, comments are terminated by the first occurrence of "-->". | |
355 | This is the behaviour of most popular browsers (like Netscape and | |
356 | MSIE), but it is not correct according to the official HTML | |
357 | standard. Officially, you need an even number of "--" tokens before | |
358 | the closing ">" is recognized and there may not be anything but | |
359 | whitespace between an even and an odd "--". | |
360 | ||
361 | The official behaviour is enabled by enabling this attribute. | |
362 | ||
363 | =item $p->strict_names( [$bool] ) | |
364 | ||
365 | By default, almost anything is allowed in tag and attribute names. | |
366 | This is the behaviour of most popular browsers and allows us to parse | |
367 | some broken tags with invalid attr values like: | |
368 | ||
369 | <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> | |
370 | ||
371 | By default, "LIST]" is parsed as a boolean attribute, not as | |
372 | part of the ALT value as was clearly intended. This is also what | |
373 | Netscape sees. | |
374 | ||
375 | The official behaviour is enabled by enabling this attribute. If | |
376 | enabled, it will cause the tag above to be reported as text | |
377 | since "LIST]" is not a legal attribute name. | |
378 | ||
379 | =item $p->boolean_attribute_value( $val ) | |
380 | ||
381 | This method sets the value reported for boolean attributes inside HTML | |
382 | start tags. By default, the name of the attribute is also used as its | |
383 | value. This affects the values reported for C<tokens> and C<attr> | |
384 | argspecs. | |
385 | ||
386 | =item $p->xml_mode( [$bool] ) | |
387 | ||
388 | Enabling this attribute changes the parser to allow some XML | |
389 | constructs such as I<empty element tags> and I<XML processing | |
390 | instructions>. It disables forcing tag and attribute names to lower | |
391 | case when they are reported by the C<tagname> and C<attr> argspecs, | |
392 | and suppress special treatment of elements that are parsed as CDATA | |
393 | for HTML. | |
394 | ||
395 | I<Empty element tags> look like start tags, but end with the character | |
396 | sequence "/>". When recognized by C<HTML::Parser> they cause an | |
397 | artificial end event in addition to the start event. The C<text> for | |
398 | the artificial end event will be empty and the C<tokenpos> array will | |
399 | be undefined even though the only element in the token array will have | |
400 | the correct tag name. | |
401 | ||
402 | I<XML processing instructions> are terminated by "?>" instead of a | |
403 | simple ">" as is the case for HTML. | |
404 | ||
405 | =item $p->unbroken_text( [$bool] ) | |
406 | ||
407 | By default, blocks of text are given to the text handler as soon as | |
408 | possible (but the parser makes sure to always break text at the | |
409 | boundary between whitespace and non-whitespace so single words and | |
410 | entities always can be decoded safely). This might create breaks that | |
411 | make it hard to do transformations on the text. When this attribute is | |
412 | enabled, blocks of text are always reported in one piece. This will | |
413 | delay the text event until the following (non-text) event has been | |
414 | recognized by the parser. | |
415 | ||
416 | Note that the C<offset> argspec will give you the offset of the first | |
417 | segment of text and C<length> is the combined length of the segments. | |
418 | Since there might be ignored tags in between, these numbers can't be | |
419 | used to directly index in the original document file. | |
420 | ||
421 | =item $p->marked_sections( [$bool] ) | |
422 | ||
423 | By default, section markings like <![CDATA[...]]> are treated like | |
424 | ordinary text. When this attribute is enabled section markings are | |
425 | honoured. | |
426 | ||
427 | There are currently no events associated with the marked section | |
428 | markup, but the text can be returned as C<skipped_text>. | |
429 | ||
430 | =item $p->attr_encoded( [$bool] ) | |
431 | ||
432 | By default, the C<attr> and C<@attr> argspecs will have general | |
433 | entities for attribute values decoded. Enabling this attribute leaves | |
434 | entities alone. | |
435 | ||
436 | =item $p->case_sensitive( [$bool] ) | |
437 | ||
438 | By default, tagnames and attribute names are down-cased. Enabling this | |
439 | attribute leaves them as found in the HTML source document. | |
440 | ||
441 | =back | |
442 | ||
443 | As markup and text are recognized, handlers are invoked. The following | |
444 | method is used to set up handlers for different events: | |
445 | ||
446 | =over | |
447 | ||
448 | =item $p->handler( event => \&subroutine, argspec ) | |
449 | ||
450 | =item $p->handler( event => method_name, argspec ) | |
451 | ||
452 | =item $p->handler( event => \@accum, argspec ) | |
453 | ||
454 | =item $p->handler( event => "" ); | |
455 | ||
456 | =item $p->handler( event => undef ); | |
457 | ||
458 | =item $p->handler( event ); | |
459 | ||
460 | This method assigns a subroutine, method, or array to handle an event. | |
461 | ||
462 | Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, | |
463 | C<process>, C<start_document>, C<end_document> or C<default>. | |
464 | ||
465 | I<Subroutine> is a reference to a subroutine which is called to handle | |
466 | the event. | |
467 | ||
468 | I<Method_name> is the name of a method of $p which is called to handle | |
469 | the event. | |
470 | ||
471 | I<Accum> is an array that will hold the event information as | |
472 | sub-arrays. | |
473 | ||
474 | If the second argument is "", the event is ignored. | |
475 | If it is undef, the default handler is invoked for the event. | |
476 | ||
477 | I<Argspec> is a string that describes the information to be reported | |
478 | for the event. Any requested information that does not apply to a | |
479 | specific event is passed as C<undef>. If argspec is omitted, then it | |
480 | is left unchanged since last update. | |
481 | ||
482 | The return value from $p->handler is the old callback routine or a | |
483 | reference to the accumulator array. | |
484 | ||
485 | Any return values from handler callback routines/methods are always | |
486 | ignored. A handler callback can request parsing to be aborted by | |
487 | invoking the $p->eof method. A handler callback is not allowed to | |
488 | invoke the $p->parse() or $p->parse_file() method. An exception will | |
489 | be raised if it tries. | |
490 | ||
491 | Examples: | |
492 | ||
493 | $p->handler(start => "start", 'self, attr, attrseq, text' ); | |
494 | ||
495 | This causes the "start" method of object $p to be called for 'start' events. | |
496 | The callback signature is $p->start(\%attr, \@attr_seq, $text). | |
497 | ||
498 | $p->handler(start => \&start, 'attr, attrseq, text' ); | |
499 | ||
500 | This causes subroutine start() to be called for 'start' events. | |
501 | The callback signature is start(\%attr, \@attr_seq, $text). | |
502 | ||
503 | $p->handler(start => \@accum, '"S", attr, attrseq, text' ); | |
504 | ||
505 | This causes 'start' event information to be saved in @accum. | |
506 | The array elements will be ['S', \%attr, \@attr_seq, $text]. | |
507 | ||
508 | $p->handler(start => ""); | |
509 | ||
510 | This causes 'start' events to be ignored. It also suppresses | |
511 | invocations of any default handler for start events. It is in most | |
512 | cases equivalent to $p->handler(start => sub {}), but is more | |
513 | efficient. It is different from the empty-sub-handler in that | |
514 | C<skipped_text> is not reset by it. | |
515 | ||
516 | $p->handler(start => undef); | |
517 | ||
518 | This causes no handler to be associated with start events. | |
519 | If there is a default handler it will be invoked. | |
520 | ||
521 | =back | |
522 | ||
523 | Filters based on tags can be set up to limit the number of events | |
524 | reported. The main bottleneck during parsing is often the huge number | |
525 | of callbacks made from the parser. Applying filters can improve | |
526 | performance significantly. | |
527 | ||
528 | The following methods control filters: | |
529 | ||
530 | =over | |
531 | ||
532 | =item $p->ignore_tags( TAG, ... ) | |
533 | ||
534 | Any C<start> and C<end> events involving any of the tags given are | |
535 | suppressed. | |
536 | ||
537 | =item $p->report_tags( TAG, ... ) | |
538 | ||
539 | Any C<start> and C<end> events involving any of the tags I<not> given | |
540 | are suppressed. | |
541 | ||
542 | =item $p->ignore_elements( TAG, ... ) | |
543 | ||
544 | Both the C<start> and the C<end> event as well as any events that | |
545 | would be reported in between are suppressed. The ignored elements can | |
546 | contain nested occurrences of themselves. Example: | |
547 | ||
548 | $p->ignore_elements(qw(script style)); | |
549 | ||
550 | The C<script> and C<style> tags will always nest properly since their | |
551 | content is parsed in CDATA mode. For most other tags | |
552 | C<ignore_elements> must be used with caution since HTML is often not | |
553 | I<well formed>. | |
554 | ||
555 | =back | |
556 | ||
557 | =head2 Argspec | |
558 | ||
559 | Argspec is a string containing a comma separated list that describes | |
560 | the information reported by the event. The following argspec | |
561 | identifier names can be used: | |
562 | ||
563 | =over | |
564 | ||
565 | =item C<self> | |
566 | ||
567 | Self causes the current object to be passed to the handler. If the | |
568 | handler is a method, this must be the first element in the argspec. | |
569 | ||
570 | An alternative to passing self as an argspec is to register closures | |
571 | that capture $self by themselves as handlers. Unfortunately this | |
572 | creates a circular reference which prevents the HTML::Parser object | |
573 | from being garbage collected. Using the C<self> argspec avoids this | |
574 | problem. | |
575 | ||
576 | =item C<tokens> | |
577 | ||
578 | Tokens causes a reference to an array of token strings to be passed. | |
579 | The strings are exactly as they were found in the original text, | |
580 | no decoding or case changes are applied. | |
581 | ||
582 | For C<declaration> events, the array contains each word, comment, and | |
583 | delimited string starting with the declaration type. | |
584 | ||
585 | For C<comment> events, this contains each sub-comment. If | |
586 | $p->strict_comment is disabled, there will be only one sub-comment. | |
587 | ||
588 | For C<start> events, this contains the original tag name followed by | |
589 | the attribute name/value pairs. The value of boolean attributes will | |
590 | be either the value set by $p->boolean_attribute_value or the | |
591 | attribute name if no value has been set by | |
592 | $p->boolean_attribute_value. | |
593 | ||
594 | For C<end> events, this contains the original tag name (always one token). | |
595 | ||
596 | For C<process> events, this contains the process instructions (always one | |
597 | token). | |
598 | ||
599 | This passes C<undef> for C<text> events. | |
600 | ||
601 | =item C<tokenpos> | |
602 | ||
603 | Tokenpos causes a reference to an array of token positions to be | |
604 | passed. For each string that appears in C<tokens>, this array | |
605 | contains two numbers. The first number is the offset of the start of | |
606 | the token in the original C<text> and the second number is the length | |
607 | of the token. | |
608 | ||
609 | Boolean attributes in a C<start> event will have (0,0) for the | |
610 | attribute value offset and length. | |
611 | ||
612 | This passes undef if there are no tokens in the event (e.g., C<text>) | |
613 | and for artificial C<end> events triggered by empty element tags. | |
614 | ||
615 | If you are using these offsets and lengths to modify C<text>, you | |
616 | should either work from right to left, or be very careful to calculate | |
617 | the changes to the offsets. | |
618 | ||
619 | =item C<token0> | |
620 | ||
621 | Token0 causes the original text of the first token string to be | |
622 | passed. This should always be the same as $tokens->[0]. | |
623 | ||
624 | For C<declaration> events, this is the declaration type. | |
625 | ||
626 | For C<start> and C<end> events, this is the tag name. | |
627 | ||
628 | For C<process> and non-strict C<comment> events, this is everything | |
629 | inside the tag. | |
630 | ||
631 | This passes undef if there are no tokens in the event. | |
632 | ||
633 | =item C<tagname> | |
634 | ||
635 | This is the element name (or I<generic identifier> in SGML jargon) for | |
636 | start and end tags. Since HTML is case insensitive this name is | |
637 | forced to lower case to ease string matching. | |
638 | ||
639 | Since XML is case sensitive, the tagname case is not changed when | |
640 | C<xml_mode> is enabled. Same happens if the C<case_sensitive> attribute | |
641 | is set. | |
642 | ||
643 | The declaration type of declaration elements is also passed as a tagname, | |
644 | even if that is a bit strange. | |
645 | In fact, in the current implementation tagname is | |
646 | identical to C<token0> except that the name may be forced to lower case. | |
647 | ||
648 | =item C<tag> | |
649 | ||
650 | Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> | |
651 | event and "!" for a declaration. The C<tag> does not have any prefix | |
652 | for C<start> events, and is in this case identical to C<tagname>. | |
653 | ||
654 | =item C<attr> | |
655 | ||
656 | Attr causes a reference to a hash of attribute name/value pairs to be | |
657 | passed. | |
658 | ||
659 | Boolean attributes' values are either the value set by | |
660 | $p->boolean_attribute_value or the attribute name if no value has been | |
661 | set by $p->boolean_attribute_value. | |
662 | ||
663 | This passes undef except for C<start> events. | |
664 | ||
665 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute | |
666 | names are forced to lower case. | |
667 | ||
668 | General entities are decoded in the attribute values and | |
669 | one layer of matching quotes enclosing the attribute values are removed. | |
670 | ||
671 | =item C<attrseq> | |
672 | ||
673 | Attrseq causes a reference to an array of attribute names to be | |
674 | passed. This can be useful if you want to walk the C<attr> hash in | |
675 | the original sequence. | |
676 | ||
677 | This passes undef except for C<start> events. | |
678 | ||
679 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute | |
680 | names are forced to lower case. | |
681 | ||
682 | =item C<@attr> | |
683 | ||
684 | Basically same as C<attr>, but keys and values are passed as | |
685 | individual arguments and the original sequence of the attributes is | |
686 | kept. The parameters passed will be the same as the @attr calculated | |
687 | here: | |
688 | ||
689 | @attr = map { $_ => $attr->{$_} } @$attrseq; | |
690 | ||
691 | assuming $attr and $attrseq here are the hash and array passed as the | |
692 | result of C<attr> and C<attrseq> argspecs. | |
693 | ||
694 | This passes no values for events besides C<start>. | |
695 | ||
696 | =item C<text> | |
697 | ||
698 | Text causes the source text (including markup element delimiters) to be | |
699 | passed. | |
700 | ||
701 | =item C<dtext> | |
702 | ||
703 | Dtext causes the decoded text to be passed. General entities are | |
704 | automatically decoded unless the event was inside a CDATA section or | |
705 | was between literal start and end tags (C<script>, C<style>, C<xmp>, | |
706 | and C<plaintext>). | |
707 | ||
708 | The Unicode character set is assumed for entity decoding. With perl | |
709 | version < 5.7.1 only the Latin1 range is supported, and entities for | |
710 | characters outside the 0..255 range are left unchanged. | |
711 | ||
712 | This passes undef except for C<text> events. | |
713 | ||
714 | =item C<is_cdata> | |
715 | ||
716 | Is_cdata causes a TRUE value to be passed if the event is inside a CDATA | |
717 | section or is between literal start and end tags (C<script>, | |
718 | C<style>, C<xmp>, and C<plaintext>). | |
719 | ||
720 | When the flag is FALSE for a text event, then you should normally | |
721 | either use C<dtext> or decode the entities yourself before the text is | |
722 | processed further. | |
723 | ||
724 | =item C<skipped_text> | |
725 | ||
726 | Skipped_text returns the concatenated text of all the events that have | |
727 | been skipped since the last time an event was reported. Events might | |
728 | be skipped because no handler is registered for them or because some | |
729 | filter applies. Skipped text also includes marked section markup, | |
730 | since there are no events that can catch it. | |
731 | ||
732 | If an C<"">-handler is registered for an event, then the text for this | |
733 | event is not included in C<skipped_text>. Skipped text both before | |
734 | and after the C<"">-event is included in the next reported | |
735 | C<skipped_text>. | |
736 | ||
737 | =item C<offset> | |
738 | ||
739 | Offset causes the byte position in the HTML document of the start of | |
740 | the event to be passed. The first byte in the document is 0. | |
741 | ||
742 | =item C<length> | |
743 | ||
744 | Length causes the number of bytes of the source text of the event to | |
745 | be passed. | |
746 | ||
747 | =item C<offset_end> | |
748 | ||
749 | Offset_end causes the byte position in the HTML document of the end of | |
750 | the event to be passed. This is the same as C<offset> + C<length>. | |
751 | ||
752 | =item C<event> | |
753 | ||
754 | Event causes the event name to be passed. | |
755 | ||
756 | The event name is one of C<text>, C<start>, C<end>, C<declaration>, | |
757 | C<comment>, C<process>, C<start_document>, C<end_document> or C<default>. | |
758 | ||
759 | =item C<line> | |
760 | ||
761 | Line causes the line number of the start of the event to be passed. | |
762 | The first line in the document is 1. Line counting doesn't start | |
763 | until at least one handler requests this value to be reported. | |
764 | ||
765 | =item C<column> | |
766 | ||
767 | Column causes the column number of the start of the event to be passed. | |
768 | The first column on a line is 0. | |
769 | ||
770 | =item C<'...'> | |
771 | ||
772 | A literal string of 0 to 255 characters enclosed | |
773 | in single (') or double (") quotes is passed as entered. | |
774 | ||
775 | =item C<undef> | |
776 | ||
777 | Pass an undefined value. Useful as padding where the same handler | |
778 | routine is registered for multiple events. | |
779 | ||
780 | =back | |
781 | ||
782 | The whole argspec string can be wrapped up in C<'@{...}'> to signal | |
783 | that the resulting event array should be flattened. This only makes a | |
784 | difference if an array reference is used as the handler target. | |
785 | Consider this example: | |
786 | ||
787 | $p->handler(text => [], 'text'); | |
788 | $p->handler(text => [], '@{text}'); | |
789 | ||
790 | With two text events; C<"foo">, C<"bar">; then the first one will end | |
791 | up with [["foo"], ["bar"]] and the second one with ["foo", "bar"] in | |
792 | the handler target array. | |
793 | ||
794 | ||
795 | =head2 Events | |
796 | ||
797 | Handlers for the following events can be registered: | |
798 | ||
799 | =over | |
800 | ||
801 | =item C<text> | |
802 | ||
803 | This event is triggered when plain text (characters) is recognized. | |
804 | The text may contain multiple lines. A sequence of text may be broken | |
805 | between several text events unless $p->unbroken_text is enabled. | |
806 | ||
807 | The parser will make sure that it does not break a word or a sequence | |
808 | of whitespace between two text events. | |
809 | ||
810 | =item C<start> | |
811 | ||
812 | This event is triggered when a start tag is recognized. | |
813 | ||
814 | Example: | |
815 | ||
816 | <A HREF="http://www.perl.com/"> | |
817 | ||
818 | =item C<end> | |
819 | ||
820 | This event is triggered when an end tag is recognized. | |
821 | ||
822 | Example: | |
823 | ||
824 | </A> | |
825 | ||
826 | =item C<declaration> | |
827 | ||
828 | This event is triggered when a I<markup declaration> is recognized. | |
829 | ||
830 | For typical HTML documents, the only declaration you are | |
831 | likely to find is <!DOCTYPE ...>. | |
832 | ||
833 | Example: | |
834 | ||
835 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" | |
836 | "http://www.w3.org/TR/html4/strict.dtd"> | |
837 | ||
838 | DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. | |
839 | ||
840 | =item C<comment> | |
841 | ||
842 | This event is triggered when a markup comment is recognized. | |
843 | ||
844 | Example: | |
845 | ||
846 | <!-- This is a comment -- -- So is this --> | |
847 | ||
848 | =item C<process> | |
849 | ||
850 | This event is triggered when processing instruction markup is | |
851 | recognized. | |
852 | ||
853 | The format and content of processing instructions are system and | |
854 | application dependent. | |
855 | ||
856 | Examples: | |
857 | ||
858 | <? HTML processing instructions > | |
859 | <? XML processing instructions ?> | |
860 | ||
861 | =item C<start_document> | |
862 | ||
863 | This event is triggered before any other events for a new document. A | |
864 | handler for it can be used to initialize stuff. There is no document | |
865 | text associated with this event. | |
866 | ||
867 | =item C<end_document> | |
868 | ||
869 | This event is triggered when $p->eof is called and after any remaining | |
870 | text is flushed. There is no document text associated with this event. | |
871 | ||
872 | =item C<default> | |
873 | ||
874 | This event is triggered for events that do not have a specific | |
875 | handler. You can set up a handler for this event to catch stuff you | |
876 | did not want to catch explicitly. | |
877 | ||
878 | =back | |
879 | ||
880 | =head1 VERSION 2 COMPATIBILITY | |
881 | ||
882 | When an C<HTML::Parser> object is constructed with no arguments, a set | |
883 | of handlers is automatically provided that is compatible with the old | |
884 | HTML::Parser version 2 callback methods. | |
885 | ||
886 | This is equivalent to the following method calls: | |
887 | ||
888 | $p->handler(start => "start", "self, tagname, attr, attrseq, text"); | |
889 | $p->handler(end => "end", "self, tagname, text"); | |
890 | $p->handler(text => "text", "self, text, is_cdata"); | |
891 | $p->handler(process => "process", "self, token0, text"); | |
892 | $p->handler(comment => | |
893 | sub { | |
894 | my($self, $tokens) = @_; | |
895 | for (@$tokens) {$self->comment($_);}}, | |
896 | "self, tokens"); | |
897 | $p->handler(declaration => | |
898 | sub { | |
899 | my $self = shift; | |
900 | $self->declaration(substr($_[0], 2, -1));}, | |
901 | "self, text"); | |
902 | ||
903 | Setup of these handlers can also be requested with the "api_version => | |
904 | 2" constructor option. | |
905 | ||
906 | =head1 SUBCLASSING | |
907 | ||
908 | The C<HTML::Parser> class is subclassable. Parser objects are plain | |
909 | hashes and C<HTML::Parser> reserves only hash keys that start with | |
910 | "_hparser". The parser state can be set up by invoking the init() | |
911 | method which takes the same arguments as new(). | |
912 | ||
913 | =head1 EXAMPLES | |
914 | ||
915 | The first simple example shows how you might strip out comments from | |
916 | an HTML document. We achieve this by setting up a comment handler that | |
917 | does nothing and a default handler that will print out anything else: | |
918 | ||
919 | use HTML::Parser; | |
920 | HTML::Parser->new(default_h => [sub { print shift }, 'text'], | |
921 | comment_h => [""], | |
922 | )->parse_file(shift || die) || die $!; | |
923 | ||
924 | An alternative implementation is: | |
925 | ||
926 | use HTML::Parser; | |
927 | HTML::Parser->new(end_document_h => [sub { print shift }, | |
928 | 'skipped_text'], | |
929 | comment_h => [""], | |
930 | )->parse_file(shift || die) || die $!; | |
931 | ||
932 | This will in most cases be much more efficient since only a single | |
933 | callback will be made. | |
934 | ||
935 | The next example prints out the text that is inside the <title> | |
936 | element of an HTML document. Here we start by setting up a start | |
937 | handler. When it sees the title start tag it enables a text handler | |
938 | that prints any text found and an end handler that will terminate | |
939 | parsing as soon as the title end tag is seen: | |
940 | ||
941 | use HTML::Parser (); | |
942 | ||
943 | sub start_handler | |
944 | { | |
945 | return if shift ne "title"; | |
946 | my $self = shift; | |
947 | $self->handler(text => sub { print shift }, "dtext"); | |
948 | $self->handler(end => sub { shift->eof if shift eq "title"; }, | |
949 | "tagname,self"); | |
950 | } | |
951 | ||
952 | my $p = HTML::Parser->new(api_version => 3); | |
953 | $p->handler( start => \&start_handler, "tagname,self"); | |
954 | $p->parse_file(shift || die) || die $!; | |
955 | print "\n"; | |
956 | ||
957 | More examples are found in the "eg/" directory of the C<HTML-Parser> | |
958 | distribution; the program C<hrefsub> shows how you can edit all links | |
959 | found in a document and C<htextsub> how to edit the text only; the | |
960 | program C<hstrip> shows how you can strip out certain tags/elements | |
961 | and/or attributes; and the program C<htext> shows how to obtain the | |
962 | plain text, but not any script/style content. | |
963 | ||
964 | =head1 BUGS | |
965 | ||
966 | The <style> and <script> sections do not end with the first "</", but | |
967 | need the complete corresponding end tag. | |
968 | ||
969 | When the I<strict_comment> option is enabled, we still recognize | |
970 | comments where there is something other than whitespace between even | |
971 | and odd "--" markers. | |
972 | ||
973 | Once $p->boolean_attribute_value has been set, there is no way to | |
974 | restore the default behaviour. | |
975 | ||
976 | There is currently no way to get both quote characters | |
977 | into the same literal argspec. | |
978 | ||
979 | Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them | |
980 | to repeat the previous start tag or close the previous start tag | |
981 | respectively. | |
982 | ||
983 | NET tags, e.g. "<code/.../" are not recognized. This is an SGML | |
984 | shorthand for "<code>...</code>". | |
985 | ||
986 | Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not | |
987 | recognized. | |
988 | ||
989 | =head1 DIAGNOSTICS | |
990 | ||
991 | The following messages may be produced by HTML::Parser. The notation | |
992 | in this listing is the same as used in L<perldiag>: | |
993 | ||
994 | =over | |
995 | ||
996 | =item Not a reference to a hash | |
997 | ||
998 | (F) The object blessed into or subclassed from HTML::Parser is not a | |
999 | hash as required by the HTML::Parser methods. | |
1000 | ||
1001 | =item Bad signature in parser state object at %p | |
1002 | ||
1003 | (F) The _hparser_xs_state element does not refer to a valid state structure. | |
1004 | Something must have changed the internal value | |
1005 | stored in this hash element, or the memory has been overwritten. | |
1006 | ||
1007 | =item _hparser_xs_state element is not a reference | |
1008 | ||
1009 | (F) The _hparser_xs_state element has been destroyed. | |
1010 | ||
1011 | =item Can't find '_hparser_xs_state' element in HTML::Parser hash | |
1012 | ||
1013 | (F) The _hparser_xs_state element is missing from the parser hash. | |
1014 | It was either deleted, or not created when the object was created. | |
1015 | ||
1016 | =item API version %s not supported by HTML::Parser %s | |
1017 | ||
1018 | (F) The constructor option 'api_version' with an argument greater than | |
1019 | or equal to 4 is reserved for future extensions. | |
1020 | ||
1021 | =item Bad constructor option '%s' | |
1022 | ||
1023 | (F) An unknown constructor option key was passed to the new() or | |
1024 | init() methods. | |
1025 | ||
1026 | =item Parse loop not allowed | |
1027 | ||
1028 | (F) A handler invoked the parse() or parse_file() method. | |
1029 | This is not permitted. | |
1030 | ||
1031 | =item marked sections not supported | |
1032 | ||
1033 | (F) The $p->marked_sections() method was invoked in an HTML::Parser | |
1034 | module that was compiled without support for marked sections. | |
1035 | ||
1036 | =item Unknown boolean attribute (%d) | |
1037 | ||
1038 | (F) Something is wrong with the internal logic that set up aliases for | |
1039 | boolean attributes. | |
1040 | ||
1041 | =item Only code or array references allowed as handler | |
1042 | ||
1043 | (F) The second argument for $p->handler must be either a subroutine | |
1044 | reference, the name of a subroutine or method, or a reference to an | |
1045 | array. | |
1046 | ||
1047 | =item No handler for %s events | |
1048 | ||
1049 | (F) The first argument to $p->handler must be a valid event name; i.e. one | |
1050 | of "start", "end", "text", "process", "declaration", "comment", "start_document", "end_document" or "default". | |
1051 | ||
1052 | =item Unrecognized identifier %s in argspec | |
1053 | ||
1054 | (F) The identifier is not a known argspec name. | |
1055 | Use one of the names mentioned in the argspec section above. | |
1056 | ||
1057 | =item Literal string is longer than 255 chars in argspec | |
1058 | ||
1059 | (F) The current implementation limits the length of literals in | |
1060 | an argspec to 255 characters. Make the literal shorter. | |
1061 | ||
1062 | =item Backslash reserved for literal string in argspec | |
1063 | ||
1064 | (F) The backslash character "\" is not allowed in argspec literals. | |
1065 | It is reserved to permit quoting inside a literal in a later version. | |
1066 | ||
1067 | =item Unterminated literal string in argspec | |
1068 | ||
1069 | (F) The terminating quote character for a literal was not found. | |
1070 | ||
1071 | =item Bad argspec (%s) | |
1072 | ||
1073 | (F) Only identifier names, literals, spaces and commas | |
1074 | are allowed in argspecs. | |
1075 | ||
1076 | =item Missing comma separator in argspec | |
1077 | ||
1078 | (F) Identifiers in an argspec must be separated with ",". | |
1079 | ||
1080 | =back | |
1081 | ||
1082 | =head1 SEE ALSO | |
1083 | ||
1084 | L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, | |
1085 | L<HTML::LinkExtor>, L<HTML::Form> | |
1086 | ||
1087 | L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) | |
1088 | ||
1089 | http://www.w3.org/TR/REC-html40 | |
1090 | ||
1091 | More information about marked sections and processing instructions may | |
1092 | be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>. | |
1093 | ||
1094 | =head1 COPYRIGHT | |
1095 | ||
1096 | Copyright 1996-2003 Gisle Aas. All rights reserved. | |
1097 | Copyright 1999-2000 Michael A. Chase. All rights reserved. | |
1098 | ||
1099 | This library is free software; you can redistribute it and/or | |
1100 | modify it under the same terms as Perl itself. | |
1101 | ||
1102 | =cut |