Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "Parser 3" | |
132 | .TH Parser 3 "2003-04-16" "perl v5.8.0" "User Contributed Perl Documentation" | |
133 | .SH "NAME" | |
134 | HTML::Parser \- HTML parser class | |
135 | .SH "SYNOPSIS" | |
136 | .IX Header "SYNOPSIS" | |
137 | .Vb 1 | |
138 | \& use HTML::Parser (); | |
139 | .Ve | |
140 | .PP | |
141 | .Vb 6 | |
142 | \& # Create parser object | |
143 | \& $p = HTML::Parser->new( api_version => 3, | |
144 | \& start_h => [\e&start, "tagname, attr"], | |
145 | \& end_h => [\e&end, "tagname"], | |
146 | \& marked_sections => 1, | |
147 | \& ); | |
148 | .Ve | |
149 | .PP | |
150 | .Vb 5 | |
151 | \& # Parse document text chunk by chunk | |
152 | \& $p->parse($chunk1); | |
153 | \& $p->parse($chunk2); | |
154 | \& #... | |
155 | \& $p->eof; # signal end of document | |
156 | .Ve | |
157 | .PP | |
158 | .Vb 5 | |
159 | \& # Parse directly from file | |
160 | \& $p->parse_file("foo.html"); | |
161 | \& # or | |
162 | \& open(F, "foo.html") || die; | |
163 | \& $p->parse_file(*F); | |
164 | .Ve | |
165 | .PP | |
166 | HTML::Parser version 2 style subclassing and method callbacks: | |
167 | .PP | |
168 | .Vb 3 | |
169 | \& { | |
170 | \& package MyParser; | |
171 | \& use base 'HTML::Parser'; | |
172 | .Ve | |
173 | .PP | |
174 | .Vb 4 | |
175 | \& sub start { | |
176 | \& my($self, $tagname, $attr, $attrseq, $origtext) = @_; | |
177 | \& #... | |
178 | \& } | |
179 | .Ve | |
180 | .PP | |
181 | .Vb 4 | |
182 | \& sub end { | |
183 | \& my($self, $tagname, $origtext) = @_; | |
184 | \& #... | |
185 | \& } | |
186 | .Ve | |
187 | .PP | |
188 | .Vb 5 | |
189 | \& sub text { | |
190 | \& my($self, $origtext, $is_cdata) = @_; | |
191 | \& #... | |
192 | \& } | |
193 | \& } | |
194 | .Ve | |
195 | .PP | |
196 | .Vb 2 | |
197 | \& my $p = MyParser->new; | |
198 | \& $p->parse_file("foo.html"); | |
199 | .Ve | |
200 | .SH "DESCRIPTION" | |
201 | .IX Header "DESCRIPTION" | |
202 | Objects of the \f(CW\*(C`HTML::Parser\*(C'\fR class will recognize markup and | |
203 | separate it from plain text (alias data content) in \s-1HTML\s0 | |
204 | documents. As different kinds of markup and text are recognized, the | |
205 | corresponding event handlers are invoked. | |
206 | .PP | |
207 | \&\f(CW\*(C`HTML::Parser\*(C'\fR is not a generic \s-1SGML\s0 parser. We have tried to | |
208 | make it able to deal with the \s-1HTML\s0 that is actually \*(L"out there\*(R", and | |
209 | it normally parses as closely as possible to the way the popular web | |
210 | browsers do it instead of strictly following one of the many \s-1HTML\s0 | |
211 | specifications from W3C. Where there is disagreement there is often | |
212 | an option that you can enable to get the official behaviour. | |
213 | .PP | |
214 | The document to be parsed may be supplied in arbitrary chunks. This | |
215 | makes on-the-fly parsing as documents are received from the network | |
216 | possible. | |
217 | .PP | |
218 | If event driven parsing does not feel right for your application, you | |
219 | might want to use \f(CW\*(C`HTML::PullParser\*(C'\fR. It is a | |
220 | \&\f(CW\*(C`HTML::Parser\*(C'\fR subclass that allows a more conventional program | |
221 | structure. | |
222 | .SH "METHODS" | |
223 | .IX Header "METHODS" | |
224 | The following method is used to construct a new \f(CW\*(C`HTML::Parser\*(C'\fR object: | |
225 | .ie n .IP "$p = HTML::Parser\->new( %options_and_handlers )" 4 | |
226 | .el .IP "$p = HTML::Parser\->new( \f(CW%options_and_handlers\fR )" 4 | |
227 | .IX Item "$p = HTML::Parser->new( %options_and_handlers )" | |
228 | This class method creates a new \f(CW\*(C`HTML::Parser\*(C'\fR object and | |
229 | returns it. Key/value pair arguments may be provided to assign event | |
230 | handlers or initialize parser options. The handlers and parser | |
231 | options can also be set or modified later by method calls described below. | |
232 | .Sp | |
233 | If a top level key is in the form \*(L"<event>_h\*(R" (e.g., \*(L"text_h\*(R") then it | |
234 | assigns a handler to that event, otherwise it initializes a parser | |
235 | option. The event handler specification value must be an array | |
236 | reference. Multiple handlers may also be assigned with the 'handlers | |
237 | => [%handlers]' option. See examples below. | |
238 | .Sp | |
239 | If \fInew()\fR is called without any arguments, it will create a parser that | |
240 | uses callback methods compatible with version 2 of \f(CW\*(C`HTML::Parser\*(C'\fR. | |
241 | See the section on \*(L"version 2 compatibility\*(R" below for details. | |
242 | .Sp | |
243 | Special constructor option 'api_version => 2' can be used to | |
244 | initialize version 2 callbacks while still setting other options and | |
245 | handlers. The 'api_version => 3' option can be used if you don't want | |
246 | to set any options and don't want to fall back to v2 compatible | |
247 | mode. | |
248 | .Sp | |
249 | Examples: | |
250 | .Sp | |
251 | .Vb 2 | |
252 | \& $p = HTML::Parser->new(api_version => 3, | |
253 | \& text_h => [ sub {...}, "dtext" ]); | |
254 | .Ve | |
255 | .Sp | |
256 | This creates a new parser object with a text event handler subroutine | |
257 | that receives the original text with general entities decoded. | |
258 | .Sp | |
259 | .Vb 2 | |
260 | \& $p = HTML::Parser->new(api_version => 3, | |
261 | \& start_h => [ 'my_start', "self,tokens" ]); | |
262 | .Ve | |
263 | .Sp | |
264 | This creates a new parser object with a start event handler method | |
265 | that receives the \f(CW$p\fR and the tokens array. | |
266 | .Sp | |
267 | .Vb 4 | |
268 | \& $p = HTML::Parser->new(api_version => 3, | |
269 | \& handlers => { text => [\e@array, "event,text"], | |
270 | \& comment => [\e@array, "event,text"], | |
271 | \& }); | |
272 | .Ve | |
273 | .Sp | |
274 | This creates a new parser object that stores the event type and the | |
275 | original text in \f(CW@array\fR for text and comment events. | |
276 | .PP | |
277 | The following methods feed the \s-1HTML\s0 document | |
278 | to the \f(CW\*(C`HTML::Parser\*(C'\fR object: | |
279 | .ie n .IP "$p\->parse( $string )" 4 | |
280 | .el .IP "$p\->parse( \f(CW$string\fR )" 4 | |
281 | .IX Item "$p->parse( $string )" | |
282 | Parse \f(CW$string\fR as the next chunk of the \s-1HTML\s0 document. The return | |
283 | value is normally a reference to the parser object (i.e. \f(CW$p\fR). | |
284 | Handlers invoked should not attempt to modify the \f(CW$string\fR in-place until | |
285 | \&\f(CW$p\fR\->parse returns. | |
286 | .Sp | |
287 | If an invoked event handler aborts parsing by calling \f(CW$p\fR\->eof, then | |
288 | \&\f(CW$p\fR\->\fIparse()\fR will return a \s-1FALSE\s0 value. | |
289 | .ie n .IP "$p\->parse( $code_ref )" 4 | |
290 | .el .IP "$p\->parse( \f(CW$code_ref\fR )" 4 | |
291 | .IX Item "$p->parse( $code_ref )" | |
292 | If a code reference is passed in as the argument to parse then the | |
293 | chunks to parse are obtained by invoking this function repeatedly. | |
294 | Parsing continues until the function returns an empty (or undefined) | |
295 | result. When this happens \f(CW$p\fR\->eof is automatically signalled. | |
296 | .Sp | |
297 | Parsing will also abort if one of the event handlers calls \f(CW$p\fR\->eof. | |
298 | .Sp | |
299 | The effect of this is the same as: | |
300 | .Sp | |
301 | .Vb 8 | |
302 | \& while (1) { | |
303 | \& my $chunk = &$code_ref(); | |
304 | \& if (!defined($chunk) || !length($chunk)) { | |
305 | \& $p->eof; | |
306 | \& return $p; | |
307 | \& } | |
308 | \& $p->parse($chunk) || return undef; | |
309 | \& } | |
310 | .Ve | |
311 | .Sp | |
312 | But it is more efficient as this loop runs internally in \s-1XS\s0 code. | |
313 | .ie n .IP "$p\->parse_file( $file )" 4 | |
314 | .el .IP "$p\->parse_file( \f(CW$file\fR )" 4 | |
315 | .IX Item "$p->parse_file( $file )" | |
316 | Parse text directly from a file. The \f(CW$file\fR argument can be a | |
317 | filename, an open file handle, or a reference to an open file | |
318 | handle. | |
319 | .Sp | |
320 | If \f(CW$file\fR contains a filename and the file can't be opened, then the | |
321 | method returns an undefined value and $! tells why it failed. | |
322 | Otherwise the return value is a reference to the parser object. | |
323 | .Sp | |
324 | If a file handle is passed as the \f(CW$file\fR argument, then the file will | |
325 | normally be read until \s-1EOF\s0, but not closed. | |
326 | .Sp | |
327 | If an invoked event handler aborts parsing by calling \f(CW$p\fR\->eof, | |
328 | then \f(CW$p\fR\->\fIparse_file()\fR may not have read the entire file. | |
329 | .Sp | |
330 | On systems with multi-byte line terminators, the values passed for the | |
331 | offset and length argspecs may be too low if \fIparse_file()\fR is called on | |
332 | a file handle that is not in binary mode. | |
333 | .Sp | |
334 | If a filename is passed in, then \fIparse_file()\fR will open the file in | |
335 | binary mode. | |
336 | .IP "$p\->eof" 4 | |
337 | .IX Item "$p->eof" | |
338 | Signals the end of the \s-1HTML\s0 document. Calling the \f(CW$p\fR\->eof method | |
339 | outside a handler callback will flush any remaining buffered text | |
340 | (which triggers the \f(CW\*(C`text\*(C'\fR event if there is any remaining text). | |
341 | .Sp | |
342 | Calling \f(CW$p\fR\->eof inside a handler will terminate parsing at that point | |
343 | and cause \f(CW$p\fR\->parse to return a \s-1FALSE\s0 value. This also terminates | |
344 | parsing by \f(CW$p\fR\->\fIparse_file()\fR. | |
345 | .Sp | |
346 | After \f(CW$p\fR\->eof has been called, the \fIparse()\fR and \fIparse_file()\fR methods | |
347 | can be invoked to feed new documents to the parser object. | |
348 | .Sp | |
349 | The return value from \fIeof()\fR is a reference to the parser object. | |
350 | .PP | |
351 | Most parser options are controlled by boolean attributes. | |
352 | Each boolean attribute is enabled by calling the corresponding method | |
353 | with a \s-1TRUE\s0 argument and disabled with a \s-1FALSE\s0 argument. The | |
354 | attribute value is left unchanged if no argument is given. The return | |
355 | value from each method is the old attribute value. | |
356 | .PP | |
357 | Methods that can be used to get and/or set parser options are: | |
358 | .IP "$p\->strict_comment( [$bool] )" 4 | |
359 | .IX Item "$p->strict_comment( [$bool] )" | |
360 | By default, comments are terminated by the first occurrence of \*(L"\-\->\*(R". | |
361 | This is the behaviour of most popular browsers (like Netscape and | |
362 | \&\s-1MSIE\s0), but it is not correct according to the official \s-1HTML\s0 | |
363 | standard. Officially, you need an even number of \*(L"\-\-\*(R" tokens before | |
364 | the closing \*(L">\*(R" is recognized and there may not be anything but | |
365 | whitespace between an even and an odd \*(L"\-\-\*(R". | |
366 | .Sp | |
367 | The official behaviour is enabled by enabling this attribute. | |
368 | .IP "$p\->strict_names( [$bool] )" 4 | |
369 | .IX Item "$p->strict_names( [$bool] )" | |
370 | By default, almost anything is allowed in tag and attribute names. | |
371 | This is the behaviour of most popular browsers and allows us to parse | |
372 | some broken tags with invalid attr values like: | |
373 | .Sp | |
374 | .Vb 1 | |
375 | \& <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> | |
376 | .Ve | |
377 | .Sp | |
378 | By default, \*(L"\s-1LIST\s0]\*(R" is parsed as a boolean attribute, not as | |
379 | part of the \s-1ALT\s0 value as was clearly intended. This is also what | |
380 | Netscape sees. | |
381 | .Sp | |
382 | The official behaviour is enabled by enabling this attribute. If | |
383 | enabled, it will cause the tag above to be reported as text | |
384 | since \*(L"\s-1LIST\s0]\*(R" is not a legal attribute name. | |
385 | .ie n .IP "$p\->boolean_attribute_value( $val )" 4 | |
386 | .el .IP "$p\->boolean_attribute_value( \f(CW$val\fR )" 4 | |
387 | .IX Item "$p->boolean_attribute_value( $val )" | |
388 | This method sets the value reported for boolean attributes inside \s-1HTML\s0 | |
389 | start tags. By default, the name of the attribute is also used as its | |
390 | value. This affects the values reported for \f(CW\*(C`tokens\*(C'\fR and \f(CW\*(C`attr\*(C'\fR | |
391 | argspecs. | |
392 | .IP "$p\->xml_mode( [$bool] )" 4 | |
393 | .IX Item "$p->xml_mode( [$bool] )" | |
394 | Enabling this attribute changes the parser to allow some \s-1XML\s0 | |
395 | constructs such as \fIempty element tags\fR and \fI\s-1XML\s0 processing | |
396 | instructions\fR. It disables forcing tag and attribute names to lower | |
397 | case when they are reported by the \f(CW\*(C`tagname\*(C'\fR and \f(CW\*(C`attr\*(C'\fR argspecs, | |
398 | and suppresses special treatment of elements that are parsed as \s-1CDATA\s0 | |
399 | for \s-1HTML\s0. | |
400 | .Sp | |
401 | \&\fIEmpty element tags\fR look like start tags, but end with the character | |
402 | sequence \*(L"/>\*(R". When recognized by \f(CW\*(C`HTML::Parser\*(C'\fR they cause an | |
403 | artificial end event in addition to the start event. The \f(CW\*(C`text\*(C'\fR for | |
404 | the artificial end event will be empty and the \f(CW\*(C`tokenpos\*(C'\fR array will | |
405 | be undefined even though the only element in the token array will have | |
406 | the correct tag name. | |
407 | .Sp | |
408 | \&\fI\s-1XML\s0 processing instructions\fR are terminated by \*(L"?>\*(R" instead of a | |
409 | simple \*(L">\*(R" as is the case for \s-1HTML\s0. | |
410 | .IP "$p\->unbroken_text( [$bool] )" 4 | |
411 | .IX Item "$p->unbroken_text( [$bool] )" | |
412 | By default, blocks of text are given to the text handler as soon as | |
413 | possible (but the parser makes sure to always break text at the | |
414 | boundary between whitespace and non-whitespace so single words and | |
415 | entities always can be decoded safely). This might create breaks that | |
416 | make it hard to do transformations on the text. When this attribute is | |
417 | enabled, blocks of text are always reported in one piece. This will | |
418 | delay the text event until the following (non\-text) event has been | |
419 | recognized by the parser. | |
420 | .Sp | |
421 | Note that the \f(CW\*(C`offset\*(C'\fR argspec will give you the offset of the first | |
422 | segment of text and \f(CW\*(C`length\*(C'\fR is the combined length of the segments. | |
423 | Since there might be ignored tags in between, these numbers can't be | |
424 | used to directly index in the original document file. | |
425 | .IP "$p\->marked_sections( [$bool] )" 4 | |
426 | .IX Item "$p->marked_sections( [$bool] )" | |
427 | By default, section markings like <![CDATA[...]]> are treated like | |
428 | ordinary text. When this attribute is enabled section markings are | |
429 | honoured. | |
430 | .Sp | |
431 | There are currently no events associated with the marked section | |
432 | markup, but the text can be returned as \f(CW\*(C`skipped_text\*(C'\fR. | |
433 | .IP "$p\->attr_encoded( [$bool] )" 4 | |
434 | .IX Item "$p->attr_encoded( [$bool] )" | |
435 | By default, the \f(CW\*(C`attr\*(C'\fR and \f(CW@attr\fR argspecs will have general | |
436 | entities for attribute values decoded. Enabling this attribute leaves | |
437 | entities alone. | |
438 | .IP "$p\->case_sensitive( [$bool] )" 4 | |
439 | .IX Item "$p->case_sensitive( [$bool] )" | |
440 | By default, tagnames and attribute names are down\-cased. Enabling this | |
441 | attribute leaves them as found in the \s-1HTML\s0 source document. | |
442 | .PP | |
443 | As markup and text are recognized, handlers are invoked. The following | |
444 | method is used to set up handlers for different events: | |
445 | .IP "$p\->handler( event => \e&subroutine, argspec )" 4 | |
446 | .IX Item "$p->handler( event => &subroutine, argspec )" | |
447 | .PD 0 | |
448 | .IP "$p\->handler( event => method_name, argspec )" 4 | |
449 | .IX Item "$p->handler( event => method_name, argspec )" | |
450 | .IP "$p\->handler( event => \e@accum, argspec )" 4 | |
451 | .IX Item "$p->handler( event => @accum, argspec )" | |
452 | .ie n .IP "$p\->handler( event => """" );" 4 | |
453 | .el .IP "$p\->handler( event => ``'' );" 4 | |
454 | .IX Item "$p->handler( event => """" );" | |
455 | .IP "$p\->handler( event => undef );" 4 | |
456 | .IX Item "$p->handler( event => undef );" | |
457 | .IP "$p\->handler( event );" 4 | |
458 | .IX Item "$p->handler( event );" | |
459 | .PD | |
460 | This method assigns a subroutine, method, or array to handle an event. | |
461 | .Sp | |
462 | Event is one of \f(CW\*(C`text\*(C'\fR, \f(CW\*(C`start\*(C'\fR, \f(CW\*(C`end\*(C'\fR, \f(CW\*(C`declaration\*(C'\fR, \f(CW\*(C`comment\*(C'\fR, | |
463 | \&\f(CW\*(C`process\*(C'\fR, \f(CW\*(C`start_document\*(C'\fR, \f(CW\*(C`end_document\*(C'\fR or \f(CW\*(C`default\*(C'\fR. | |
464 | .Sp | |
465 | \&\fISubroutine\fR is a reference to a subroutine which is called to handle | |
466 | the event. | |
467 | .Sp | |
468 | \&\fIMethod_name\fR is the name of a method of \f(CW$p\fR which is called to handle | |
469 | the event. | |
470 | .Sp | |
471 | \&\fIAccum\fR is an array that will hold the event information as | |
472 | sub\-arrays. | |
473 | .Sp | |
474 | If the second argument is "", the event is ignored. | |
475 | If it is undef, the default handler is invoked for the event. | |
476 | .Sp | |
477 | \&\fIArgspec\fR is a string that describes the information to be reported | |
478 | for the event. Any requested information that does not apply to a | |
479 | specific event is passed as \f(CW\*(C`undef\*(C'\fR. If argspec is omitted, then it | |
480 | is left unchanged since last update. | |
481 | .Sp | |
482 | The return value from \f(CW$p\fR\->handler is the old callback routine or a | |
483 | reference to the accumulator array. | |
484 | .Sp | |
485 | Any return values from handler callback routines/methods are always | |
486 | ignored. A handler callback can request parsing to be aborted by | |
487 | invoking the \f(CW$p\fR\->eof method. A handler callback is not allowed to | |
488 | invoke the \f(CW$p\fR\->\fIparse()\fR or \f(CW$p\fR\->\fIparse_file()\fR method. An exception will | |
489 | be raised if it tries. | |
490 | .Sp | |
491 | Examples: | |
492 | .Sp | |
493 | .Vb 1 | |
494 | \& $p->handler(start => "start", 'self, attr, attrseq, text' ); | |
495 | .Ve | |
496 | .Sp | |
497 | This causes the \*(L"start\*(R" method of object \f(CW$p\fR to be called for 'start' events. | |
498 | The callback signature is \f(CW$p\fR\->start(\e%attr, \e@attr_seq, \f(CW$text\fR). | |
499 | .Sp | |
500 | .Vb 1 | |
501 | \& $p->handler(start => \e&start, 'attr, attrseq, text' ); | |
502 | .Ve | |
503 | .Sp | |
504 | This causes subroutine \fIstart()\fR to be called for 'start' events. | |
505 | The callback signature is start(\e%attr, \e@attr_seq, \f(CW$text\fR). | |
506 | .Sp | |
507 | .Vb 1 | |
508 | \& $p->handler(start => \e@accum, '"S", attr, attrseq, text' ); | |
509 | .Ve | |
510 | .Sp | |
511 | This causes 'start' event information to be saved in \f(CW@accum\fR. | |
512 | The array elements will be ['S', \e%attr, \e@attr_seq, \f(CW$text\fR]. | |
513 | .Sp | |
514 | .Vb 1 | |
515 | \& $p->handler(start => ""); | |
516 | .Ve | |
517 | .Sp | |
518 | This causes 'start' events to be ignored. It also suppresses | |
519 | invocations of any default handler for start events. It is in most | |
520 | cases equivalent to \f(CW$p\fR\->handler(start => sub {}), but is more | |
521 | efficient. It is different from the empty-sub-handler in that | |
522 | \&\f(CW\*(C`skipped_text\*(C'\fR is not reset by it. | |
523 | .Sp | |
524 | .Vb 1 | |
525 | \& $p->handler(start => undef); | |
526 | .Ve | |
527 | .Sp | |
528 | This causes no handler to be associated with start events. | |
529 | If there is a default handler it will be invoked. | |
530 | .PP | |
531 | Filters based on tags can be set up to limit the number of events | |
532 | reported. The main bottleneck during parsing is often the huge number | |
533 | of callbacks made from the parser. Applying filters can improve | |
534 | performance significantly. | |
535 | .PP | |
536 | The following methods control filters: | |
537 | .IP "$p\->ignore_tags( \s-1TAG\s0, ... )" 4 | |
538 | .IX Item "$p->ignore_tags( TAG, ... )" | |
539 | Any \f(CW\*(C`start\*(C'\fR and \f(CW\*(C`end\*(C'\fR events involving any of the tags given are | |
540 | suppressed. | |
541 | .IP "$p\->report_tags( \s-1TAG\s0, ... )" 4 | |
542 | .IX Item "$p->report_tags( TAG, ... )" | |
543 | Any \f(CW\*(C`start\*(C'\fR and \f(CW\*(C`end\*(C'\fR events involving any of the tags \fInot\fR given | |
544 | are suppressed. | |
545 | .IP "$p\->ignore_elements( \s-1TAG\s0, ... )" 4 | |
546 | .IX Item "$p->ignore_elements( TAG, ... )" | |
547 | Both the \f(CW\*(C`start\*(C'\fR and the \f(CW\*(C`end\*(C'\fR event as well as any events that | |
548 | would be reported in between are suppressed. The ignored elements can | |
549 | contain nested occurrences of itself. Example: | |
550 | .Sp | |
551 | .Vb 1 | |
552 | \& $p->ignore_elements(qw(script style)); | |
553 | .Ve | |
554 | .Sp | |
555 | The \f(CW\*(C`script\*(C'\fR and \f(CW\*(C`style\*(C'\fR tags will always nest properly since their | |
556 | content is parsed in \s-1CDATA\s0 mode. For most other tags | |
557 | \&\f(CW\*(C`ignore_elements\*(C'\fR must be used with caution since \s-1HTML\s0 is often not | |
558 | \&\fIwell formed\fR. | |
559 | .Sh "Argspec" | |
560 | .IX Subsection "Argspec" | |
561 | Argspec is a string containing a comma separated list that describes | |
562 | the information reported by the event. The following argspec | |
563 | identifier names can be used: | |
564 | .ie n .IP """self""" 4 | |
565 | .el .IP "\f(CWself\fR" 4 | |
566 | .IX Item "self" | |
567 | Self causes the current object to be passed to the handler. If the | |
568 | handler is a method, this must be the first element in the argspec. | |
569 | .Sp | |
570 | An alternative to passing self as an argspec is to register closures | |
571 | that capture \f(CW$self\fR by themselves as handlers. Unfortunately this | |
572 | creates a circular reference which prevents the HTML::Parser object | |
573 | from being garbage collected. Using the \f(CW\*(C`self\*(C'\fR argspec avoids this | |
574 | problem. | |
575 | .ie n .IP """tokens""" 4 | |
576 | .el .IP "\f(CWtokens\fR" 4 | |
577 | .IX Item "tokens" | |
578 | Tokens causes a reference to an array of token strings to be passed. | |
579 | The strings are exactly as they were found in the original text, | |
580 | no decoding or case changes are applied. | |
581 | .Sp | |
582 | For \f(CW\*(C`declaration\*(C'\fR events, the array contains each word, comment, and | |
583 | delimited string starting with the declaration type. | |
584 | .Sp | |
585 | For \f(CW\*(C`comment\*(C'\fR events, this contains each sub\-comment. If | |
586 | \&\f(CW$p\fR\->strict_comment is disabled, there will be only one sub\-comment. | |
587 | .Sp | |
588 | For \f(CW\*(C`start\*(C'\fR events, this contains the original tag name followed by | |
589 | the attribute name/value pairs. The value of boolean attributes will | |
590 | be either the value set by \f(CW$p\fR\->boolean_attribute_value or the | |
591 | attribute name if no value has been set by | |
592 | \&\f(CW$p\fR\->boolean_attribute_value. | |
593 | .Sp | |
594 | For \f(CW\*(C`end\*(C'\fR events, this contains the original tag name (always one token). | |
595 | .Sp | |
596 | For \f(CW\*(C`process\*(C'\fR events, this contains the process instructions (always one | |
597 | token). | |
598 | .Sp | |
599 | This passes \f(CW\*(C`undef\*(C'\fR for \f(CW\*(C`text\*(C'\fR events. | |
600 | .ie n .IP """tokenpos""" 4 | |
601 | .el .IP "\f(CWtokenpos\fR" 4 | |
602 | .IX Item "tokenpos" | |
603 | Tokenpos causes a reference to an array of token positions to be | |
604 | passed. For each string that appears in \f(CW\*(C`tokens\*(C'\fR, this array | |
605 | contains two numbers. The first number is the offset of the start of | |
606 | the token in the original \f(CW\*(C`text\*(C'\fR and the second number is the length | |
607 | of the token. | |
608 | .Sp | |
609 | Boolean attributes in a \f(CW\*(C`start\*(C'\fR event will have (0,0) for the | |
610 | attribute value offset and length. | |
611 | .Sp | |
612 | This passes undef if there are no tokens in the event (e.g., \f(CW\*(C`text\*(C'\fR) | |
613 | and for artificial \f(CW\*(C`end\*(C'\fR events triggered by empty element tags. | |
614 | .Sp | |
615 | If you are using these offsets and lengths to modify \f(CW\*(C`text\*(C'\fR, you | |
616 | should either work from right to left, or be very careful to calculate | |
617 | the changes to the offsets. | |
618 | .ie n .IP """token0""" 4 | |
619 | .el .IP "\f(CWtoken0\fR" 4 | |
620 | .IX Item "token0" | |
621 | Token0 causes the original text of the first token string to be | |
622 | passed. This should always be the same as \f(CW$tokens\fR\->[0]. | |
623 | .Sp | |
624 | For \f(CW\*(C`declaration\*(C'\fR events, this is the declaration type. | |
625 | .Sp | |
626 | For \f(CW\*(C`start\*(C'\fR and \f(CW\*(C`end\*(C'\fR events, this is the tag name. | |
627 | .Sp | |
628 | For \f(CW\*(C`process\*(C'\fR and non-strict \f(CW\*(C`comment\*(C'\fR events, this is everything | |
629 | inside the tag. | |
630 | .Sp | |
631 | This passes undef if there are no tokens in the event. | |
632 | .ie n .IP """tagname""" 4 | |
633 | .el .IP "\f(CWtagname\fR" 4 | |
634 | .IX Item "tagname" | |
635 | This is the element name (or \fIgeneric identifier\fR in \s-1SGML\s0 jargon) for | |
636 | start and end tags. Since \s-1HTML\s0 is case insensitive this name is | |
637 | forced to lower case to ease string matching. | |
638 | .Sp | |
639 | Since \s-1XML\s0 is case sensitive, the tagname case is not changed when | |
640 | \&\f(CW\*(C`xml_mode\*(C'\fR is enabled. Same happens if the \f(CW\*(C`case_sensitive\*(C'\fR attribute | |
641 | is set. | |
642 | .Sp | |
643 | The declaration type of declaration elements is also passed as a tagname, | |
644 | even if that is a bit strange. | |
645 | In fact, in the current implementation tagname is | |
646 | identical to \f(CW\*(C`token0\*(C'\fR except that the name may be forced to lower case. | |
647 | .ie n .IP """tag""" 4 | |
648 | .el .IP "\f(CWtag\fR" 4 | |
649 | .IX Item "tag" | |
650 | Same as \f(CW\*(C`tagname\*(C'\fR, but prefixed with \*(L"/\*(R" if it belongs to an \f(CW\*(C`end\*(C'\fR | |
651 | event and \*(L"!\*(R" for a declaration. The \f(CW\*(C`tag\*(C'\fR does not have any prefix | |
652 | for \f(CW\*(C`start\*(C'\fR events, and is in this case identical to \f(CW\*(C`tagname\*(C'\fR. | |
653 | .ie n .IP """attr""" 4 | |
654 | .el .IP "\f(CWattr\fR" 4 | |
655 | .IX Item "attr" | |
656 | Attr causes a reference to a hash of attribute name/value pairs to be | |
657 | passed. | |
658 | .Sp | |
659 | Boolean attributes' values are either the value set by | |
660 | \&\f(CW$p\fR\->boolean_attribute_value or the attribute name if no value has been | |
661 | set by \f(CW$p\fR\->boolean_attribute_value. | |
662 | .Sp | |
663 | This passes undef except for \f(CW\*(C`start\*(C'\fR events. | |
664 | .Sp | |
665 | Unless \f(CW\*(C`xml_mode\*(C'\fR or \f(CW\*(C`case_sensitive\*(C'\fR is enabled, the attribute | |
666 | names are forced to lower case. | |
667 | .Sp | |
668 | General entities are decoded in the attribute values and | |
669 | one layer of matching quotes enclosing the attribute values is removed. | |
670 | .ie n .IP """attrseq""" 4 | |
671 | .el .IP "\f(CWattrseq\fR" 4 | |
672 | .IX Item "attrseq" | |
673 | Attrseq causes a reference to an array of attribute names to be | |
674 | passed. This can be useful if you want to walk the \f(CW\*(C`attr\*(C'\fR hash in | |
675 | the original sequence. | |
676 | .Sp | |
677 | This passes undef except for \f(CW\*(C`start\*(C'\fR events. | |
678 | .Sp | |
679 | Unless \f(CW\*(C`xml_mode\*(C'\fR or \f(CW\*(C`case_sensitive\*(C'\fR is enabled, the attribute | |
680 | names are forced to lower case. | |
681 | .ie n .IP "@attr" 4 | |
682 | .el .IP "\f(CW@attr\fR" 4 | |
683 | .IX Item "@attr" | |
684 | Basically the same as \f(CW\*(C`attr\*(C'\fR, but keys and values are passed as | |
685 | individual arguments and the original sequence of the attributes is | |
686 | kept. The parameters passed will be the same as the \f(CW@attr\fR calculated | |
687 | here: | |
688 | .Sp | |
689 | .Vb 1 | |
690 | \& @attr = map { $_ => $attr->{$_} } @$attrseq; | |
691 | .Ve | |
692 | .Sp | |
693 | assuming \f(CW$attr\fR and \f(CW$attrseq\fR here are the hash and array passed as the | |
694 | result of \f(CW\*(C`attr\*(C'\fR and \f(CW\*(C`attrseq\*(C'\fR argspecs. | |
695 | .Sp | |
696 | This passes no values for events besides \f(CW\*(C`start\*(C'\fR. | |
697 | .ie n .IP """text""" 4 | |
698 | .el .IP "\f(CWtext\fR" 4 | |
699 | .IX Item "text" | |
700 | Text causes the source text (including markup element delimiters) to be | |
701 | passed. | |
702 | .ie n .IP """dtext""" 4 | |
703 | .el .IP "\f(CWdtext\fR" 4 | |
704 | .IX Item "dtext" | |
705 | Dtext causes the decoded text to be passed. General entities are | |
706 | automatically decoded unless the event was inside a \s-1CDATA\s0 section or | |
707 | was between literal start and end tags (\f(CW\*(C`script\*(C'\fR, \f(CW\*(C`style\*(C'\fR, \f(CW\*(C`xmp\*(C'\fR, | |
708 | and \f(CW\*(C`plaintext\*(C'\fR). | |
709 | .Sp | |
710 | The Unicode character set is assumed for entity decoding. With perl | |
711 | version < 5.7.1 only the Latin1 range is supported, and entities for | |
712 | characters outside the 0..255 range are left unchanged. | |
713 | .Sp | |
714 | This passes undef except for \f(CW\*(C`text\*(C'\fR events. | |
715 | .ie n .IP """is_cdata""" 4 | |
716 | .el .IP "\f(CWis_cdata\fR" 4 | |
717 | .IX Item "is_cdata" | |
718 | Is_cdata causes a \s-1TRUE\s0 value to be passed if the event is inside a \s-1CDATA\s0 | |
719 | section or is between literal start and end tags (\f(CW\*(C`script\*(C'\fR, | |
720 | \&\f(CW\*(C`style\*(C'\fR, \f(CW\*(C`xmp\*(C'\fR, and \f(CW\*(C`plaintext\*(C'\fR). | |
721 | .Sp | |
722 | When the flag is \s-1FALSE\s0 for a text event, then you should normally | |
723 | either use \f(CW\*(C`dtext\*(C'\fR or decode the entities yourself before the text is | |
724 | processed further. | |
725 | .ie n .IP """skipped_text""" 4 | |
726 | .el .IP "\f(CWskipped_text\fR" 4 | |
727 | .IX Item "skipped_text" | |
728 | Skipped_text returns the concatenated text of all the events that have | |
729 | been skipped since the last time an event was reported. Events might | |
730 | be skipped because no handler is registered for them or because some | |
731 | filter applies. Skipped text also includes marked section markup, | |
732 | since there are no events that can catch it. | |
733 | .Sp | |
734 | If an \f(CW""\fR\-handler is registered for an event, then the text for this | |
735 | event is not included in \f(CW\*(C`skipped_text\*(C'\fR. Skipped text both before | |
736 | and after the \f(CW""\fR\-event is included in the next reported | |
737 | \&\f(CW\*(C`skipped_text\*(C'\fR. | |
738 | .ie n .IP """offset""" 4 | |
739 | .el .IP "\f(CWoffset\fR" 4 | |
740 | .IX Item "offset" | |
741 | Offset causes the byte position in the \s-1HTML\s0 document of the start of | |
742 | the event to be passed. The first byte in the document is 0. | |
743 | .ie n .IP """length""" 4 | |
744 | .el .IP "\f(CWlength\fR" 4 | |
745 | .IX Item "length" | |
746 | Length causes the number of bytes of the source text of the event to | |
747 | be passed. | |
748 | .ie n .IP """offset_end""" 4 | |
749 | .el .IP "\f(CWoffset_end\fR" 4 | |
750 | .IX Item "offset_end" | |
751 | Offset_end causes the byte position in the \s-1HTML\s0 document of the end of | |
752 | the event to be passed. This is the same as \f(CW\*(C`offset\*(C'\fR + \f(CW\*(C`length\*(C'\fR. | |
753 | .ie n .IP """event""" 4 | |
754 | .el .IP "\f(CWevent\fR" 4 | |
755 | .IX Item "event" | |
756 | Event causes the event name to be passed. | |
757 | .Sp | |
758 | The event name is one of \f(CW\*(C`text\*(C'\fR, \f(CW\*(C`start\*(C'\fR, \f(CW\*(C`end\*(C'\fR, \f(CW\*(C`declaration\*(C'\fR, | |
759 | \&\f(CW\*(C`comment\*(C'\fR, \f(CW\*(C`process\*(C'\fR, \f(CW\*(C`start_document\*(C'\fR, \f(CW\*(C`end_document\*(C'\fR or \f(CW\*(C`default\*(C'\fR. | |
760 | .ie n .IP """line""" 4 | |
761 | .el .IP "\f(CWline\fR" 4 | |
762 | .IX Item "line" | |
763 | Line causes the line number of the start of the event to be passed. | |
764 | The first line in the document is 1. Line counting doesn't start | |
765 | until at least one handler requests this value to be reported. | |
766 | .ie n .IP """column""" 4 | |
767 | .el .IP "\f(CWcolumn\fR" 4 | |
768 | .IX Item "column" | |
769 | Column causes the column number of the start of the event to be passed. | |
770 | The first column on a line is 0. | |
771 | .ie n .IP "'...'" 4 | |
772 | .el .IP "\f(CW'...'\fR" 4 | |
773 | .IX Item "'...'" | |
774 | A literal string of 0 to 255 characters enclosed | |
775 | in single (') or double (") quotes is passed as entered. | |
776 | .ie n .IP """undef""" 4 | |
777 | .el .IP "\f(CWundef\fR" 4 | |
778 | .IX Item "undef" | |
779 | Pass an undefined value. Useful as padding where the same handler | |
780 | routine is registered for multiple events. | |
781 | .PP | |
782 | The whole argspec string can be wrapped up in \f(CW'@{...}'\fR to signal | |
783 | that the resulting event array should be flattened. This only makes a | |
784 | difference if an array reference is used as the handler target. | |
785 | Consider this example: | |
786 | .PP | |
787 | .Vb 2 | |
788 | \& $p->handler(text => [], 'text'); | |
789 | \& $p->handler(text => [], '@{text}'); | |
790 | .Ve | |
791 | .PP | |
792 | With two text events; \f(CW"foo"\fR, \f(CW"bar"\fR; then the first one will end | |
793 | up with [[\*(L"foo\*(R"], [\*(L"bar\*(R"]] and the second one with [\*(L"foo\*(R", \*(L"bar\*(R"] in | |
794 | the handler target array. | |
795 | .Sh "Events" | |
796 | .IX Subsection "Events" | |
797 | Handlers for the following events can be registered: | |
798 | .ie n .IP """text""" 4 | |
799 | .el .IP "\f(CWtext\fR" 4 | |
800 | .IX Item "text" | |
801 | This event is triggered when plain text (characters) is recognized. | |
802 | The text may contain multiple lines. A sequence of text may be broken | |
803 | between several text events unless \f(CW$p\fR\->unbroken_text is enabled. | |
804 | .Sp | |
805 | The parser will make sure that it does not break a word or a sequence | |
806 | of whitespace between two text events. | |
807 | .ie n .IP """start""" 4 | |
808 | .el .IP "\f(CWstart\fR" 4 | |
809 | .IX Item "start" | |
810 | This event is triggered when a start tag is recognized. | |
811 | .Sp | |
812 | Example: | |
813 | .Sp | |
814 | .Vb 1 | |
815 | \& <A HREF="http://www.perl.com/"> | |
816 | .Ve | |
817 | .ie n .IP """end""" 4 | |
818 | .el .IP "\f(CWend\fR" 4 | |
819 | .IX Item "end" | |
820 | This event is triggered when an end tag is recognized. | |
821 | .Sp | |
822 | Example: | |
823 | .Sp | |
824 | .Vb 1 | |
825 | \& </A> | |
826 | .Ve | |
827 | .ie n .IP """declaration""" 4 | |
828 | .el .IP "\f(CWdeclaration\fR" 4 | |
829 | .IX Item "declaration" | |
830 | This event is triggered when a \fImarkup declaration\fR is recognized. | |
831 | .Sp | |
832 | For typical \s-1HTML\s0 documents, the only declaration you are | |
833 | likely to find is <!DOCTYPE ...>. | |
834 | .Sp | |
835 | Example: | |
836 | .Sp | |
837 | .Vb 2 | |
838 | \& <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" | |
839 | \& "http://www.w3.org/TR/html40/strict.dtd"> | |
840 | .Ve | |
841 | .Sp | |
842 | DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. | |
843 | .ie n .IP """comment""" 4 | |
844 | .el .IP "\f(CWcomment\fR" 4 | |
845 | .IX Item "comment" | |
846 | This event is triggered when a markup comment is recognized. | |
847 | .Sp | |
848 | Example: | |
849 | .Sp | |
850 | .Vb 1 | |
851 | \& <!-- This is a comment -- -- So is this --> | |
852 | .Ve | |
853 | .ie n .IP """process""" 4 | |
854 | .el .IP "\f(CWprocess\fR" 4 | |
855 | .IX Item "process" | |
856 | This event is triggered when processing instruction markup is | |
857 | recognized. | |
858 | .Sp | |
859 | The format and content of processing instructions are system and | |
860 | application dependent. | |
861 | .Sp | |
862 | Examples: | |
863 | .Sp | |
864 | .Vb 2 | |
865 | \& <? HTML processing instructions > | |
866 | \& <? XML processing instructions ?> | |
867 | .Ve | |
868 | .ie n .IP """start_document""" 4 | |
869 | .el .IP "\f(CWstart_document\fR" 4 | |
870 | .IX Item "start_document" | |
871 | This event is triggered before any other events for a new document. A | |
872 | handler for it can be used to initialize stuff. There is no document | |
873 | text associated with this event. | |
874 | .ie n .IP """end_document""" 4 | |
875 | .el .IP "\f(CWend_document\fR" 4 | |
876 | .IX Item "end_document" | |
877 | This event is triggered when \f(CW$p\fR\->eof is called and after any remaining | |
878 | text is flushed. There is no document text associated with this event. | |
879 | .ie n .IP """default""" 4 | |
880 | .el .IP "\f(CWdefault\fR" 4 | |
881 | .IX Item "default" | |
882 | This event is triggered for events that do not have a specific | |
883 | handler. You can set up a handler for this event to catch stuff you | |
884 | did not want to catch explicitly. | |
885 | .SH "VERSION 2 COMPATIBILITY" | |
886 | .IX Header "VERSION 2 COMPATIBILITY" | |
887 | When an \f(CW\*(C`HTML::Parser\*(C'\fR object is constructed with no arguments, a set | |
888 | of handlers is automatically provided that is compatible with the old | |
889 | HTML::Parser version 2 callback methods. | |
890 | .PP | |
891 | This is equivalent to the following method calls: | |
892 | .PP | |
893 | .Vb 14 | |
894 | \& $p->handler(start => "start", "self, tagname, attr, attrseq, text"); | |
895 | \& $p->handler(end => "end", "self, tagname, text"); | |
896 | \& $p->handler(text => "text", "self, text, is_cdata"); | |
897 | \& $p->handler(process => "process", "self, token0, text"); | |
898 | \& $p->handler(comment => | |
899 | \& sub { | |
900 | \& my($self, $tokens) = @_; | |
901 | \& for (@$tokens) {$self->comment($_);}}, | |
902 | \& "self, tokens"); | |
903 | \& $p->handler(declaration => | |
904 | \& sub { | |
905 | \& my $self = shift; | |
906 | \& $self->declaration(substr($_[0], 2, -1));}, | |
907 | \& "self, text"); | |
908 | .Ve | |
909 | .PP | |
910 | Setup of these handlers can also be requested with the \*(L"api_version => | |
911 | 2\*(R" constructor option. | |
912 | .SH "SUBCLASSING" | |
913 | .IX Header "SUBCLASSING" | |
914 | The \f(CW\*(C`HTML::Parser\*(C'\fR class is subclassable. Parser objects are plain | |
915 | hashes and \f(CW\*(C`HTML::Parser\*(C'\fR reserves only hash keys that start with | |
916 | \&\*(L"_hparser\*(R". The parser state can be set up by invoking the \fIinit()\fR | |
917 | method which takes the same arguments as \fInew()\fR. | |
918 | .SH "EXAMPLES" | |
919 | .IX Header "EXAMPLES" | |
920 | The first simple example shows how you might strip out comments from | |
921 | an \s-1HTML\s0 document. We achieve this by setting up a comment handler that | |
922 | does nothing and a default handler that will print out anything else: | |
923 | .PP | |
924 | .Vb 4 | |
925 | \& use HTML::Parser; | |
926 | \& HTML::Parser->new(default_h => [sub { print shift }, 'text'], | |
927 | \& comment_h => [""], | |
928 | \& )->parse_file(shift || die) || die $!; | |
929 | .Ve | |
930 | .PP | |
931 | An alternative implementation is: | |
932 | .PP | |
933 | .Vb 5 | |
934 | \& use HTML::Parser; | |
935 | \& HTML::Parser->new(end_document_h => [sub { print shift }, | |
936 | \& 'skipped_text'], | |
937 | \& comment_h => [""], | |
938 | \& )->parse_file(shift || die) || die $!; | |
939 | .Ve | |
940 | .PP | |
941 | This will in most cases be much more efficient since only a single | |
942 | callback will be made. | |
943 | .PP | |
944 | The next example prints out the text that is inside the <title> | |
945 | element of an \s-1HTML\s0 document. Here we start by setting up a start | |
946 | handler. When it sees the title start tag it enables a text handler | |
947 | that prints any text found and an end handler that will terminate | |
948 | parsing as soon as the title end tag is seen: | |
949 | .PP | |
950 | .Vb 1 | |
951 | \& use HTML::Parser (); | |
952 | .Ve | |
953 | .PP | |
954 | .Vb 8 | |
955 | \& sub start_handler | |
956 | \& { | |
957 | \& return if shift ne "title"; | |
958 | \& my $self = shift; | |
959 | \& $self->handler(text => sub { print shift }, "dtext"); | |
960 | \& $self->handler(end => sub { shift->eof if shift eq "title"; }, | |
961 | \& "tagname,self"); | |
962 | \& } | |
963 | .Ve | |
964 | .PP | |
965 | .Vb 4 | |
966 | \& my $p = HTML::Parser->new(api_version => 3); | |
967 | \& $p->handler( start => \e&start_handler, "tagname,self"); | |
968 | \& $p->parse_file(shift || die) || die $!; | |
969 | \& print "\en"; | |
970 | .Ve | |
971 | .PP | |
972 | More examples are found in the \*(L"eg/\*(R" directory of the \f(CW\*(C`HTML\-Parser\*(C'\fR | |
973 | distribution; the program \f(CW\*(C`hrefsub\*(C'\fR shows how you can edit all links | |
974 | found in a document and \f(CW\*(C`htextsub\*(C'\fR how to edit the text only; the | |
975 | program \f(CW\*(C`hstrip\*(C'\fR shows how you can strip out certain tags/elements | |
976 | and/or attributes; and the program \f(CW\*(C`htext\*(C'\fR shows how to obtain the | |
977 | plain text, but not any script/style content. | |
978 | .SH "BUGS" | |
979 | .IX Header "BUGS" | |
980 | The <style> and <script> sections do not end with the first \*(L"</\*(R", but | |
981 | need the complete corresponding end tag. | |
982 | .PP | |
983 | When the \fIstrict_comment\fR option is enabled, we still recognize | |
984 | comments where there is something other than whitespace between even | |
985 | and odd \*(L"\-\-\*(R" markers. | |
986 | .PP | |
987 | Once \f(CW$p\fR\->boolean_attribute_value has been set, there is no way to | |
988 | restore the default behaviour. | |
989 | .PP | |
990 | There is currently no way to get both quote characters | |
991 | into the same literal argspec. | |
992 | .PP | |
993 | Empty tags, e.g. \*(L"<>\*(R" and \*(L"</>\*(R", are not recognized. \s-1SGML\s0 allows them | |
994 | to repeat the previous start tag or close the previous start tag | |
995 | respectively. | |
996 | .PP | |
997 | \&\s-1NET\s0 tags, e.g. \*(L"code/.../\*(R" are not recognized. This is an \s-1SGML\s0 | |
998 | shorthand for \*(L"<code>...</code>\*(R". | |
999 | .PP | |
1000 | Unclosed start or end tags, e.g. \*(L"<tt<b>...</b</tt>\*(R" are not | |
1001 | recognized. | |
1002 | .SH "DIAGNOSTICS" | |
1003 | .IX Header "DIAGNOSTICS" | |
1004 | The following messages may be produced by HTML::Parser. The notation | |
1005 | in this listing is the same as used in perldiag: | |
1006 | .IP "Not a reference to a hash" 4 | |
1007 | .IX Item "Not a reference to a hash" | |
1008 | (F) The object blessed into or subclassed from HTML::Parser is not a | |
1009 | hash as required by the HTML::Parser methods. | |
1010 | .ie n .IP "Bad signature in parser state object at %p" 4 | |
1011 | .el .IP "Bad signature in parser state object at \f(CW%p\fR" 4 | |
1012 | .IX Item "Bad signature in parser state object at %p" | |
1013 | (F) The _hparser_xs_state element does not refer to a valid state structure. | |
1014 | Something must have changed the internal value | |
1015 | stored in this hash element, or the memory has been overwritten. | |
1016 | .IP "_hparser_xs_state element is not a reference" 4 | |
1017 | .IX Item "_hparser_xs_state element is not a reference" | |
1018 | (F) The _hparser_xs_state element has been destroyed. | |
1019 | .IP "Can't find '_hparser_xs_state' element in HTML::Parser hash" 4 | |
1020 | .IX Item "Can't find '_hparser_xs_state' element in HTML::Parser hash" | |
1021 | (F) The _hparser_xs_state element is missing from the parser hash. | |
1022 | It was either deleted, or not created when the object was created. | |
1023 | .ie n .IP "\s-1API\s0 version %s not supported by HTML::Parser %s" 4 | |
1024 | .el .IP "\s-1API\s0 version \f(CW%s\fR not supported by HTML::Parser \f(CW%s\fR" 4 | |
1025 | .IX Item "API version %s not supported by HTML::Parser %s" | |
1026 | (F) The constructor option 'api_version' with an argument greater than | |
1027 | or equal to 4 is reserved for future extensions. | |
1028 | .IP "Bad constructor option '%s'" 4 | |
1029 | .IX Item "Bad constructor option '%s'" | |
1030 | (F) An unknown constructor option key was passed to the \fInew()\fR or | |
1031 | \&\fIinit()\fR methods. | |
1032 | .IP "Parse loop not allowed" 4 | |
1033 | .IX Item "Parse loop not allowed" | |
1034 | (F) A handler invoked the \fIparse()\fR or \fIparse_file()\fR method. | |
1035 | This is not permitted. | |
1036 | .IP "marked sections not supported" 4 | |
1037 | .IX Item "marked sections not supported" | |
1038 | (F) The \f(CW$p\fR\->\fImarked_sections()\fR method was invoked in an HTML::Parser | |
1039 | module that was compiled without support for marked sections. | |
1040 | .IP "Unknown boolean attribute (%d)" 4 | |
1041 | .IX Item "Unknown boolean attribute (%d)" | |
1042 | (F) Something is wrong with the internal logic that set up aliases for | |
1043 | boolean attributes. | |
1044 | .IP "Only code or array references allowed as handler" 4 | |
1045 | .IX Item "Only code or array references allowed as handler" | |
1046 | (F) The second argument for \f(CW$p\fR\->handler must be either a subroutine | |
1047 | reference, the name of a subroutine or method, or a reference to an | |
1048 | array. | |
1049 | .ie n .IP "No handler for %s events" 4 | |
1050 | .el .IP "No handler for \f(CW%s\fR events" 4 | |
1051 | .IX Item "No handler for %s events" | |
1052 | (F) The first argument to \f(CW$p\fR\->handler must be a valid event name; i.e. one | |
1053 | of \*(L"start\*(R", \*(L"end\*(R", \*(L"text\*(R", \*(L"process\*(R", \*(L"declaration\*(R" or \*(L"comment\*(R". | |
1054 | .ie n .IP "Unrecognized identifier %s in argspec" 4 | |
1055 | .el .IP "Unrecognized identifier \f(CW%s\fR in argspec" 4 | |
1056 | .IX Item "Unrecognized identifier %s in argspec" | |
1057 | (F) The identifier is not a known argspec name. | |
1058 | Use one of the names mentioned in the argspec section above. | |
1059 | .IP "Literal string is longer than 255 chars in argspec" 4 | |
1060 | .IX Item "Literal string is longer than 255 chars in argspec" | |
1061 | (F) The current implementation limits the length of literals in | |
1062 | an argspec to 255 characters. Make the literal shorter. | |
1063 | .IP "Backslash reserved for literal string in argspec" 4 | |
1064 | .IX Item "Backslash reserved for literal string in argspec" | |
1065 | (F) The backslash character \*(L"\e\*(R" is not allowed in argspec literals. | |
1066 | It is reserved to permit quoting inside a literal in a later version. | |
1067 | .IP "Unterminated literal string in argspec" 4 | |
1068 | .IX Item "Unterminated literal string in argspec" | |
1069 | (F) The terminating quote character for a literal was not found. | |
1070 | .IP "Bad argspec (%s)" 4 | |
1071 | .IX Item "Bad argspec (%s)" | |
1072 | (F) Only identifier names, literals, spaces and commas | |
1073 | are allowed in argspecs. | |
1074 | .IP "Missing comma separator in argspec" 4 | |
1075 | .IX Item "Missing comma separator in argspec" | |
1076 | (F) Identifiers in an argspec must be separated with \*(L",\*(R". | |
1077 | .SH "SEE ALSO" | |
1078 | .IX Header "SEE ALSO" | |
1079 | HTML::Entities, HTML::PullParser, HTML::TokeParser, HTML::HeadParser, | |
1080 | HTML::LinkExtor, HTML::Form | |
1081 | .PP | |
1082 | HTML::TreeBuilder (part of the \fIHTML-Tree\fR distribution) | |
1083 | .PP | |
1084 | http://www.w3.org/TR/REC\-html40 | |
1085 | .PP | |
1086 | More information about marked sections and processing instructions may | |
1087 | be found at \f(CW\*(C`http://www.sgml.u\-net.com/book/sgml\-8.htm\*(C'\fR. | |
1088 | .SH "COPYRIGHT" | |
1089 | .IX Header "COPYRIGHT" | |
1090 | .Vb 2 | |
1091 | \& Copyright 1996-2003 Gisle Aas. All rights reserved. | |
1092 | \& Copyright 1999-2000 Michael A. Chase. All rights reserved. | |
1093 | .Ve | |
1094 | .PP | |
1095 | This library is free software; you can redistribute it and/or | |
1096 | modify it under the same terms as Perl itself. |