Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package XML::Parser::Expat; |
2 | ||
3 | require 5.004; | |
4 | ||
5 | use strict; | |
6 | use vars qw($VERSION @ISA %Handler_Setters %Encoding_Table @Encoding_Path | |
7 | $have_File_Spec); | |
8 | use Carp; | |
9 | ||
10 | require DynaLoader; | |
11 | ||
12 | @ISA = qw(DynaLoader); | |
13 | $VERSION = "2.34" ; | |
14 | ||
# File::Spec is optional: it is used when it is already loaded or can be
# loaded here; otherwise paths are joined with '/' by hand.
$have_File_Spec = $INC{'File/Spec.pm'} || do 'File/Spec.pm';

%Encoding_Table = ();

# Encoding maps are searched for in every existing XML/Parser/Encodings
# directory below an @INC entry, and finally in the current directory.
if ($have_File_Spec) {
    my @candidates =
        map { File::Spec->catdir($_, qw(XML Parser Encodings)) } @INC;
    @Encoding_Path = ((grep { -d $_ } @candidates), File::Spec->curdir);
}
else {
    my @candidates = map { $_ . '/XML/Parser/Encodings' } @INC;
    @Encoding_Path = ((grep { -d $_ } @candidates), '.');
}

# Load the compiled half of the module.
XML::Parser::Expat->bootstrap($VERSION);

# Maps each handler type accepted by setHandlers to the function that
# installs a handler of that type on a parser.
%Handler_Setters = (
    Start        => \&SetStartElementHandler,
    End          => \&SetEndElementHandler,
    Char         => \&SetCharacterDataHandler,
    Proc         => \&SetProcessingInstructionHandler,
    Comment      => \&SetCommentHandler,
    CdataStart   => \&SetStartCdataHandler,
    CdataEnd     => \&SetEndCdataHandler,
    Default      => \&SetDefaultHandler,
    Unparsed     => \&SetUnparsedEntityDeclHandler,
    Notation     => \&SetNotationDeclHandler,
    ExternEnt    => \&SetExternalEntityRefHandler,
    ExternEntFin => \&SetExtEntFinishHandler,
    Entity       => \&SetEntityDeclHandler,
    Element      => \&SetElementDeclHandler,
    Attlist      => \&SetAttListDeclHandler,
    Doctype      => \&SetDoctypeHandler,
    DoctypeFin   => \&SetEndDoctypeHandler,
    XMLDecl      => \&SetXMLDeclHandler,
);
51 | ||
# Constructor.  The option hash passed by the caller becomes the object
# itself; bookkeeping fields are then initialised and the underlying
# parser is created and stored under the Parser key.
sub new {
    my ($class, %args) = @_;
    my $self = bless \%args, $class;

    $self->{_State_} = 0;     # 0 until parse() (or parse_more) sets it to 1
    $self->{Context} = [];    # Context stack, empty at construction
    $self->{Namespaces}   ||= 0;
    $self->{ErrorMessage} ||= '';

    # Namespace bookkeeping is only allocated when it was asked for.
    if ($self->{Namespaces}) {
        $self->{Namespace_Table} = {};
        $self->{Namespace_List}  = [undef];
        $self->{Prefix_Table}    = {};
        $self->{New_Prefixes}    = [];
    }

    $self->{_Setters} = \%Handler_Setters;
    $self->{Parser}   = ParserCreate($self, $self->{ProtocolEncoding},
                                     $self->{Namespaces});
    return $self;
}
70 | ||
# load_encoding(NAME)
#
# Locate and load an encoding map.  The last path component of NAME is
# lower-cased and '.enc' is appended when missing.  Unless the result starts
# with '/', each directory in @Encoding_Path is tried in order and the first
# existing file wins.  The file contents are handed to LoadEncoding.
#
# Returns the name reported by LoadEncoding.  Croaks when the file cannot
# be opened or read, or when LoadEncoding does not recognise it.
sub load_encoding {
    my ($file) = @_;

    $file =~ s!([^/]+)$!\L$1\E!;
    $file .= '.enc' unless $file =~ /\.enc$/;

    unless ($file =~ m!^/!) {
        foreach my $dir (@Encoding_Path) {
            my $candidate = ($have_File_Spec
                             ? File::Spec->catfile($dir, $file)
                             : "$dir/$file");
            if (-e $candidate) {
                $file = $candidate;
                last;
            }
        }
    }

    # Three-argument open: the name must only ever be treated as a file to
    # read.  With the two-argument form a leading '>' or '|' in the name
    # would select a different open mode.
    local(*ENC);
    open(ENC, '<', $file) or croak("Couldn't open encmap $file:\n$!\n");
    binmode(ENC);

    my $data;
    my $br = sysread(ENC, $data, -s $file);
    croak("Trouble reading $file:\n$!\n")
        unless defined($br);
    close(ENC);

    my $name = LoadEncoding($data, $br);
    croak("$file isn't an encmap file")
        unless defined($name);

    return $name;
}    # End load_encoding
103 | ||
# setHandlers(TYPE => HANDLER, ...)
#
# Install handlers through the setter functions registered in
# $self->{_Setters}.  A handler must be a code ref or a false value.
# Returns a flat list of (TYPE, value returned by the setter) pairs, in the
# order the pairs were given.  Croaks on an odd argument count, on a true
# handler that is not a code ref, and on an unknown TYPE.
sub setHandlers {
    my ($self, @handler_pairs) = @_;

    croak("Uneven number of arguments to setHandlers method")
        if @handler_pairs % 2;

    my @previous;

    while (my ($type, $handler) = splice(@handler_pairs, 0, 2)) {
        if ($handler and ref($handler) ne 'CODE') {
            croak "Handler for $type not a Code ref";
        }

        my $setter = $self->{_Setters}->{$type};

        if (!defined($setter)) {
            my @types = sort keys %{$self->{_Setters}};
            croak("Unknown Expat handler type: $type\n Valid types: @types");
        }

        my $old = $setter->($self->{Parser}, $handler);
        push(@previous, $type, $old);
    }

    return @previous;
}
131 | ||
# xpcroak(MESSAGE)
#
# Croak with MESSAGE followed by " at line N", where N comes from
# GetCurrentLineNumber.  When the ErrorContext option is defined, the
# output of position_in_context is appended as well.
sub xpcroak {
    my ($self, $message) = @_;

    my $context_lines = $self->{ErrorContext};
    $message .= " at line " . GetCurrentLineNumber($self->{Parser});

    if (defined($context_lines)) {
        $message .= ":\n" . $self->position_in_context($context_lines);
    }

    croak $message;
}
143 | ||
# xpcarp(MESSAGE)
#
# Same message construction as xpcroak, but the result is passed to carp
# instead of croak.
sub xpcarp {
    my ($self, $message) = @_;

    my $context_lines = $self->{ErrorContext};
    $message .= " at line " . GetCurrentLineNumber($self->{Parser});

    if (defined($context_lines)) {
        $message .= ":\n" . $self->position_in_context($context_lines);
    }

    carp $message;
}
154 | ||
# Calls DefaultCurrent on the underlying parser, but only while a parse is
# in progress (_State_ == 1); otherwise a false value results.
sub default_current {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return DefaultCurrent($xp->{Parser});
    }
}
161 | ||
# Returns RecognizedString for the underlying parser while a parse is in
# progress (_State_ == 1); otherwise a false value results.
sub recognized_string {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return RecognizedString($xp->{Parser});
    }
}
168 | ||
# Returns OriginalString for the underlying parser while a parse is in
# progress (_State_ == 1); otherwise a false value results.
sub original_string {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return OriginalString($xp->{Parser});
    }
}
175 | ||
# Returns GetCurrentLineNumber for the underlying parser while a parse is
# in progress (_State_ == 1); otherwise a false value results.
sub current_line {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return GetCurrentLineNumber($xp->{Parser});
    }
}
182 | ||
# Returns GetCurrentColumnNumber for the underlying parser while a parse is
# in progress (_State_ == 1); otherwise a false value results.
sub current_column {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return GetCurrentColumnNumber($xp->{Parser});
    }
}
189 | ||
# Returns GetCurrentByteIndex for the underlying parser while a parse is in
# progress (_State_ == 1); otherwise a false value results.
sub current_byte {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return GetCurrentByteIndex($xp->{Parser});
    }
}
196 | ||
# base([NEWBASE])
#
# Returns the base currently stored in the parser (GetBase).  When an
# argument is supplied, even an undefined one, it is installed with
# SetBase after the old value has been fetched.
sub base {
    my ($self, $newbase) = @_;

    my $handle   = $self->{Parser};
    my $previous = GetBase($handle);

    if (@_ > 1) {
        SetBase($handle, $newbase);
    }

    return $previous;
}
204 | ||
# Returns the contents of the Context stack as a list (its length in
# scalar context).
sub context {
    my ($self) = @_;
    return @{ $self->{Context} };
}
209 | ||
# Returns the last entry of the Context stack, or undef when the stack is
# empty.
sub current_element {
    my $stack = $_[0]->{Context};
    return @$stack ? $stack->[-1] : undef;
}
214 | ||
# in_element(NAME)
#
# Compares NAME, via eq_name, with the last entry of the Context stack.
# Returns undef when the stack is empty.
sub in_element {
    my ($self, $element) = @_;

    my $open = $self->{Context};
    return undef unless @$open;
    return $self->eq_name($open->[-1], $element);
}
220 | ||
# within_element(NAME)
#
# Returns how many entries of the Context stack are equal to NAME according
# to eq_name (0 when there are none).
sub within_element {
    my ($self, $element) = @_;

    return scalar grep { $self->eq_name($_, $element) } @{ $self->{Context} };
}
229 | ||
# Returns the number of entries on the Context stack.
sub depth {
    my ($self) = @_;
    return scalar @{ $self->{Context} };
}
234 | ||
# Returns ElementIndex for the underlying parser while a parse is in
# progress (_State_ == 1); otherwise a false value results.
sub element_index {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return ElementIndex($xp->{Parser});
    }
}
242 | ||
243 | ################ | |
244 | # Namespace methods | |
245 | ||
# namespace(NAME)
#
# Uses the integer value of NAME as an index into Namespace_List and
# returns the entry found there.  -w warnings are switched off locally
# because NAME may not look numeric.
sub namespace {
    my ($self, $name) = @_;

    local $^W = 0;
    my $index = int($name);
    return $self->{Namespace_List}->[$index];
}
251 | ||
# eq_name(NAME1, NAME2)
#
# True when the two names have the same integer value and the same string
# value.  -w warnings are switched off locally because the names may not
# look numeric.
sub eq_name {
    my ($self, $first, $second) = @_;

    local $^W = 0;
    return (int($first) == int($second) && $first eq $second);
}
258 | ||
# generate_ns_name(NAME, NAMESPACE)
#
# With a true NAMESPACE, returns the result of GenerateNSName for NAME using
# this parser's Namespace_Table and Namespace_List.  With a false
# NAMESPACE, NAME is returned unchanged.
sub generate_ns_name {
    my ($self, $name, $namespace) = @_;

    return $name unless $namespace;

    return GenerateNSName($name, $namespace,
                          $self->{Namespace_Table},
                          $self->{Namespace_List});
}
267 | ||
# Returns the contents of New_Prefixes when namespace processing is on,
# and an empty list otherwise.
sub new_ns_prefixes {
    my ($self) = @_;

    return $self->{Namespaces} ? @{ $self->{New_Prefixes} } : ();
}
275 | ||
# expand_ns_prefix(PREFIX)
#
# Returns the most recently pushed entry of PREFIX's stack in
# Prefix_Table.  Returns undef when namespace processing is off or when
# PREFIX has no (or an empty) stack.
sub expand_ns_prefix {
    my ($self, $prefix) = @_;

    return undef unless $self->{Namespaces};

    my $bindings = $self->{Prefix_Table}->{$prefix};
    return undef unless defined($bindings) and @$bindings;

    return $bindings->[-1];
}
286 | ||
# Returns the keys of Prefix_Table when namespace processing is on, leaving
# out '#default' when the last entry of its stack is undefined.  Returns an
# empty list when namespace processing is off.  Prefix_Table itself is not
# modified.
sub current_ns_prefixes {
    my ($self) = @_;

    return () unless $self->{Namespaces};

    my $table = $self->{Prefix_Table};
    return grep {
        !($_ eq '#default' and not defined($table->{$_}->[-1]))
    } keys %$table;
}
302 | ||
303 | ||
304 | ################################################################ | |
305 | # Namespace declaration handlers | |
306 | # | |
307 | ||
# NamespaceStart(PREFIX, URI)
#
# Pushes URI onto PREFIX's stack in Prefix_Table, creating the stack when
# needed, and appends PREFIX to New_Prefixes.  An undefined PREFIX is
# recorded as '#default'.
sub NamespaceStart {
    my ($self, $prefix, $uri) = @_;

    $prefix = '#default' unless defined $prefix;

    # A missing stack springs into existence on the first push.
    push(@{ $self->{Prefix_Table}->{$prefix} }, $uri);

    # The New_Prefixes list gets emptied at end of startElement function
    # in Expat.xs

    push(@{ $self->{New_Prefixes} }, $prefix);
}
326 | ||
# NamespaceEnd(PREFIX)
#
# Removes the last entry from PREFIX's stack in Prefix_Table.  When that
# entry is the only one left, the whole stack is deleted from the table
# instead.  An undefined PREFIX means '#default'.
sub NamespaceEnd {
    my ($self, $prefix) = @_;

    $prefix = '#default' unless defined $prefix;

    my $table    = $self->{Prefix_Table};
    my $bindings = $table->{$prefix};

    @$bindings > 1 ? pop(@$bindings) : delete $table->{$prefix};
}
340 | ||
341 | ################ | |
342 | ||
# Returns GetSpecifiedAttributeCount for the underlying parser while a
# parse is in progress (_State_ == 1); otherwise a false value results.
sub specified_attr {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        return GetSpecifiedAttributeCount($xp->{Parser});
    }
}
350 | ||
# While a parse is in progress (_State_ == 1), calls UnsetAllHandlers on
# the underlying parser.  Does nothing in any other state.
sub finish {
    my ($xp) = @_;

    if ($xp->{_State_} == 1) {
        UnsetAllHandlers($xp->{Parser});
    }
}
358 | ||
# position_in_context(LINES)
#
# While a parse is in progress (_State_ == 1), fetches a context string and
# an offset from PositionContext and returns the string with a marker line
# ('=' repeated column-1 times, then '^') inserted at that offset.  When
# the offset is not inside the string, the marker is appended instead.
# Returns '' when PositionContext yields no string.
sub position_in_context {
    my ($self, $lines) = @_;

    if ($self->{_State_} == 1) {
        my $parser = $self->{Parser};
        my ($context, $split_at) = PositionContext($parser, $lines);

        return '' unless defined($context);

        my $column = GetCurrentColumnNumber($parser);
        my $marker = ('=' x ($column - 1)) . "^\n";

        # Decide where the marker goes before the string is lengthened.
        my $insert_inside = $split_at < length($context);

        $context .= "\n" unless $context =~ /\n$/;

        return $insert_inside
            ? substr($context, 0, $split_at) . $marker . substr($context, $split_at)
            : $context . $marker;
    }
}
384 | ||
# xml_escape(TEXT [, CHAR, ...])
#
# Returns a copy of TEXT in which every '&' becomes '&amp;' and every '<'
# becomes '&lt;'.  Each additional CHAR must be exactly one character; every
# occurrence of it is replaced too: '>' by '&gt;', '"' by '&quot;', "'" by
# '&apos;', and any other character by a hexadecimal character reference
# such as '&#x3B;'.
#
# All replacements happen in one pass over the original text, so characters
# that occur inside an inserted entity ('&', '#', ';', letters) are never
# escaped a second time.
#
# Croaks when a CHAR argument is not exactly one character long.
sub xml_escape {
    my $self = shift;
    my $text = shift;

    # Characters that are always escaped.
    my %entity_for = ('&' => '&amp;', '<' => '&lt;');

    foreach my $char (@_) {
        my $shown = defined($char) ? $char : '';
        croak "xml_escape: '$shown' isn't a single character"
            unless length($shown) == 1;

        # '&' and '<' (or a repeated argument) already have a replacement.
        next if exists $entity_for{$char};

        $entity_for{$char} =
              $char eq '>' ? '&gt;'
            : $char eq '"' ? '&quot;'
            : $char eq "'" ? '&apos;'
            :                sprintf('&#x%X;', ord($char));
    }

    # quotemeta keeps characters such as ']', '^', '-' and '\' literal
    # inside the bracketed character class.
    my $char_class = join('', map { quotemeta($_) } keys %entity_for);
    $text =~ s/([$char_class])/$entity_for{$1}/g;

    return $text;
}
417 | ||
# skip_until(INDEX)
#
# Passes INDEX to SkipUntil on the underlying parser, provided _State_ is
# at most 1.  Does nothing when _State_ is greater than 1.
sub skip_until {
    my ($self, $index) = @_;

    if ($self->{_State_} <= 1) {
        SkipUntil($self->{Parser}, $index);
    }
}
424 | ||
# Calls ParserRelease on the underlying parser handle.
sub release {
    my ($self) = @_;
    ParserRelease($self->{Parser});
}
429 | ||
# Destructor: calls ParserFree on the underlying parser handle.
sub DESTROY {
    my ($self) = @_;
    ParserFree($self->{Parser});
}
434 | ||
# parse(SOURCE)
#
# Parses SOURCE, which is either a string or something that can be read as
# a stream.  Croaks when _State_ is already true, and croaks with the
# object's ErrorMessage when the parse result is false.  _State_ is 1
# during the parse and 2 once it has returned.  Returns the (true) parse
# result.
sub parse {
    my ($self, $arg) = @_;

    croak "Parse already in progress (Expat)" if $self->{_State_};
    $self->{_State_} = 1;

    my $parser = $self->{Parser};
    my $result = 0;
    my $ioref;

    # Work out whether $arg can be treated as an I/O handle.
    if (defined $arg) {
        if (ref($arg) and UNIVERSAL::isa($arg, 'IO::Handle')) {
            $ioref = $arg;
        }
        elsif (tied($arg)) {
            my $class = ref($arg);
            no strict 'refs';
            $ioref = $arg if defined &{"${class}::TIEHANDLE"};
        }
        else {
            # Try $arg as a glob or glob name; a failure here simply
            # leaves $ioref undefined, so $arg is parsed as a string.
            require IO::Handle;
            eval {
                no strict 'refs';
                $ioref = *{$arg}{IO} if defined *{$arg};
            };
            undef $@;
        }
    }

    if (defined($ioref)) {
        my $delim = $self->{Stream_Delimiter};

        if (defined($delim)) {
            # Switch the input record separator to the delimiter line for
            # the duration of the parse, then put the old value back.
            my $io_class = ref($ioref);
            my $prev_rs  = $io_class->input_record_separator("\n$delim\n");

            $result = ParseStream($parser, $ioref, $delim);

            $io_class->input_record_separator($prev_rs);
        }
        else {
            $result = ParseStream($parser, $ioref, $delim);
        }
    }
    else {
        $result = ParseString($parser, $arg);
    }

    $self->{_State_} = 2;

    croak $self->{ErrorMessage} unless $result;
    return $result;
}
480 | ||
# Forwards all arguments to the parse method and returns whatever it
# returns.
sub parsestring {
    my ($self, @args) = @_;
    return $self->parse(@args);
}
485 | ||
# parsefile(PATH)
#
# Opens PATH for reading in binary mode, hands the handle to the parse
# method and returns its result.  Croaks when _State_ is already true or
# when PATH cannot be opened.
sub parsefile {
    my $self = shift;

    croak "Parser has already been used" if $self->{_State_};

    # Three-argument open: PATH is always treated as the name of a file to
    # read.  With the two-argument form, characters such as '>' or '|' in
    # PATH would select a different open mode or run a command.
    local(*FILE);
    open(FILE, '<', $_[0]) or croak "Couldn't open $_[0]:\n$!";
    binmode(FILE);

    my $ret = $self->parse(*FILE);
    close(FILE);

    return $ret;
}
496 | ||
497 | ################################################################ | |
package XML::Parser::ContentModel;

# Objects of this class describe an element content model.  They stringify
# through asString and compare with 'eq' through thiseq.
use overload '""' => \&asString, 'eq' => \&thiseq;

# Values of the Type field.
sub EMPTY  () {1}
sub ANY    () {2}
sub MIXED  () {3}
sub NAME   () {4}
sub CHOICE () {5}
sub SEQ    () {6}

# Type predicates: each compares the Type field with one constant.
sub isempty {
    my ($model) = @_;
    return $model->{Type} == EMPTY;
}

sub isany {
    my ($model) = @_;
    return $model->{Type} == ANY;
}

sub ismixed {
    my ($model) = @_;
    return $model->{Type} == MIXED;
}

sub isname {
    my ($model) = @_;
    return $model->{Type} == NAME;
}

# Returns the Tag field.
sub name {
    my ($model) = @_;
    return $model->{Tag};
}

sub ischoice {
    my ($model) = @_;
    return $model->{Type} == CHOICE;
}

sub isseq {
    my ($model) = @_;
    return $model->{Type} == SEQ;
}

# Returns the Quant field.
sub quant {
    my ($model) = @_;
    return $model->{Quant};
}

# Returns the list held in the Children field, or undef when that field is
# not defined.
sub children {
    my ($model) = @_;

    my $kids = $model->{Children};
    return defined($kids) ? @$kids : undef;
}

# Renders the model as text: "EMPTY", "ANY", the tag name, a
# '(#PCDATA|...)' group for mixed content, or a parenthesised group whose
# members are joined with '|' (choice) or ',' (anything else).  A true
# Quant field is appended, except for EMPTY and ANY.
sub asString {
    my ($self) = @_;

    my $type = $self->{Type};
    return "EMPTY" if $type == EMPTY;
    return "ANY"   if $type == ANY;

    my $text;
    if ($type == NAME) {
        $text = $self->{Tag};
    }
    elsif ($type == MIXED) {
        $text = join('|', '(#PCDATA', @{ $self->{Children} }) . ')';
    }
    else {
        my $separator = ($type == CHOICE) ? '|' : ',';
        my @parts     = map { $_->asString } @{ $self->{Children} };
        $text = '(' . join($separator, @parts) . ')';
    }

    $text .= $self->{Quant} if $self->{Quant};
    return $text;
}

# Handler for the overloaded 'eq': compares the string form of the model
# with the other operand.
sub thiseq {
    my ($self, $other) = @_;
    return $self->asString eq $other;
}
583 | ||
584 | ################################################################ | |
package XML::Parser::ExpatNB;

# Subclass of XML::Parser::Expat that is fed its document in pieces through
# parse_more and finished with parse_done.  The one-shot parse methods of
# the parent class are disabled.

use vars qw(@ISA);
use Carp;

@ISA = qw(XML::Parser::Expat);

# Not available in this class: always croaks.
sub parse {
    my ($self) = @_;
    croak "parse method not supported in " . ref($self);
}

# Not available in this class: always croaks.
sub parsestring {
    my ($self) = @_;
    croak "parsestring method not supported in " . ref($self);
}

# Not available in this class: always croaks.
sub parsefile {
    my ($self) = @_;
    croak "parsefile method not supported in " . ref($self);
}

# parse_more(DATA)
#
# Marks the parse as in progress and passes DATA to ParsePartial.  Croaks
# with the object's ErrorMessage when ParsePartial returns a false value.
sub parse_more {
    my ($self, $data) = @_;

    $self->{_State_} = 1;
    my $ok = XML::Parser::Expat::ParsePartial($self->{Parser}, $data);

    $ok or croak $self->{ErrorMessage};
}

# parse_done
#
# Ends the parse with ParseDone.  On failure the parser is released and the
# object's ErrorMessage is croaked.  On success _State_ becomes 2, the
# FinalHandler (when defined) is called in the caller's context, and the
# parser is released.  Returns the handler's result; without a handler the
# scalar result is the value returned by ParseDone and the list result is
# empty.
sub parse_done {
    my ($self) = @_;

    my $ok = XML::Parser::Expat::ParseDone($self->{Parser});
    if (!$ok) {
        my $msg = $self->{ErrorMessage};
        $self->release;
        croak $msg;
    }

    $self->{_State_} = 2;

    my $scalar_result = $ok;
    my @list_result   = ();

    my $final = $self->{FinalHandler};
    if (defined $final) {
        if (wantarray) {
            @list_result = $final->($self);
        }
        else {
            $scalar_result = $final->($self);
        }
    }

    $self->release;

    return unless defined wantarray;
    return wantarray ? @list_result : $scalar_result;
}
648 | ||
649 | ################################################################ | |
650 | ||
package XML::Parser::Encinfo;

# Destructor: passes the object to XML::Parser::Expat::FreeEncoding.
sub DESTROY {
    my ($encinfo) = @_;
    XML::Parser::Expat::FreeEncoding($encinfo);
}
657 | ||
658 | 1; | |
659 | ||
660 | __END__ | |
661 | ||
662 | =head1 NAME | |
663 | ||
664 | XML::Parser::Expat - Lowlevel access to James Clark's expat XML parser | |
665 | ||
666 | =head1 SYNOPSIS | |
667 | ||
668 | use XML::Parser::Expat; | |
669 | ||
670 | $parser = new XML::Parser::Expat; | |
671 | $parser->setHandlers('Start' => \&sh, | |
672 | 'End' => \&eh, | |
673 | 'Char' => \&ch); | |
674 | open(FOO, 'info.xml') or die "Couldn't open"; | |
675 | $parser->parse(*FOO); | |
676 | close(FOO); | |
677 | # $parser->parse('<foo id="me"> here <em>we</em> go </foo>'); | |
678 | ||
679 | sub sh | |
680 | { | |
681 | my ($p, $el, %atts) = @_; | |
682 | $p->setHandlers('Char' => \&spec) | |
683 | if ($el eq 'special'); | |
684 | ... | |
685 | } | |
686 | ||
687 | sub eh | |
688 | { | |
689 | my ($p, $el) = @_; | |
690 | $p->setHandlers('Char' => \&ch) # Special elements won't contain | |
691 | if ($el eq 'special'); # other special elements | |
692 | ... | |
693 | } | |
694 | ||
695 | =head1 DESCRIPTION | |
696 | ||
697 | This module provides an interface to James Clark's XML parser, expat. As in | |
698 | expat, a single instance of the parser can only parse one document. Calls | |
699 | to parsestring after the first for a given instance will die. | |
700 | ||
701 | Expat (and XML::Parser::Expat) are event based. As the parser recognizes | |
702 | parts of the document (say the start or end of an XML element), then any | |
703 | handlers registered for that type of an event are called with suitable | |
704 | parameters. | |
705 | ||
706 | =head1 METHODS | |
707 | ||
708 | =over 4 | |
709 | ||
710 | =item new | |
711 | ||
712 | This is a class method, the constructor for XML::Parser::Expat. Options are | |
713 | passed as keyword value pairs. The recognized options are: | |
714 | ||
715 | =over 4 | |
716 | ||
717 | =item * ProtocolEncoding | |
718 | ||
719 | The protocol encoding name. The default is none. The expat built-in | |
720 | encodings are: C<UTF-8>, C<ISO-8859-1>, C<UTF-16>, and C<US-ASCII>. | |
721 | Other encodings may be used if they have encoding maps in one of the | |
722 | directories in the @Encoding_Path list. Setting the protocol encoding | |
723 | overrides any encoding in the XML declaration. | |
724 | ||
725 | =item * Namespaces | |
726 | ||
727 | When this option is given with a true value, then the parser does namespace | |
728 | processing. By default, namespace processing is turned off. When it is | |
729 | turned on, the parser consumes I<xmlns> attributes and strips off prefixes | |
730 | from element and attributes names where those prefixes have a defined | |
731 | namespace. A name's namespace can be found using the L<"namespace"> method | |
732 | and two names can be checked for absolute equality with the L<"eq_name"> | |
733 | method. | |
734 | ||
735 | =item * NoExpand | |
736 | ||
737 | Normally, the parser will try to expand references to entities defined in | |
738 | the internal subset. If this option is set to a true value, and a default | |
739 | handler is also set, then the default handler will be called when an | |
740 | entity reference is seen in text. This has no effect if a default handler | |
741 | has not been registered, and it has no effect on the expansion of entity | |
742 | references inside attribute values. | |
743 | ||
744 | =item * Stream_Delimiter | |
745 | ||
746 | This option takes a string value. When this string is found alone on a line | |
747 | while parsing from a stream, then the parse is ended as if it saw an end of | |
748 | file. The intended use is with a stream of xml documents in a MIME multipart | |
749 | format. The string should not contain a trailing newline. | |
750 | ||
751 | =item * ErrorContext | |
752 | ||
753 | When this option is defined, errors are reported in context. The value | |
754 | of ErrorContext should be the number of lines to show on either side of | |
755 | the line in which the error occurred. | |
756 | ||
757 | =item * ParseParamEnt | |
758 | ||
759 | Unless standalone is set to "yes" in the XML declaration, setting this to | |
760 | a true value allows the external DTD to be read, and parameter entities | |
761 | to be parsed and expanded. | |
762 | ||
763 | =item * Base | |
764 | ||
765 | The base to use for relative pathnames or URLs. This can also be done by | |
766 | using the base method. | |
767 | ||
768 | =back | |
769 | ||
770 | =item setHandlers(TYPE, HANDLER [, TYPE, HANDLER [...]]) | |
771 | ||
772 | This method registers handlers for the various events. If no handlers are | |
773 | registered, then a call to parsestring or parsefile will only determine if | |
774 | the corresponding XML document is well formed (by returning without error.) | |
775 | This may be called from within a handler, after the parse has started. | |
776 | ||
777 | Setting a handler to something that evaluates to false unsets that | |
778 | handler. | |
779 | ||
780 | This method returns a list of type, handler pairs corresponding to the | |
781 | input. The handlers returned are the ones that were in effect before the | |
782 | call to setHandlers. | |
783 | ||
784 | The recognized events and the parameters passed to the corresponding | |
785 | handlers are: | |
786 | ||
787 | =over 4 | |
788 | ||
789 | =item * Start (Parser, Element [, Attr, Val [,...]]) | |
790 | ||
791 | This event is generated when an XML start tag is recognized. Parser is | |
792 | an XML::Parser::Expat instance. Element is the name of the XML element that | |
793 | is opened with the start tag. The Attr & Val pairs are generated for each | |
794 | attribute in the start tag. | |
795 | ||
796 | =item * End (Parser, Element) | |
797 | ||
798 | This event is generated when an XML end tag is recognized. Note that | |
799 | an XML empty tag (<foo/>) generates both a start and an end event. | |
800 | ||
801 | There is always a lower level start and end handler installed that wrap | |
802 | the corresponding callbacks. This is to handle the context mechanism. | |
803 | A consequence of this is that the default handler (see below) will not | |
804 | see a start tag or end tag unless the default_current method is called. | |
805 | ||
806 | =item * Char (Parser, String) | |
807 | ||
808 | This event is generated when non-markup is recognized. The non-markup | |
809 | sequence of characters is in String. A single non-markup sequence of | |
810 | characters may generate multiple calls to this handler. Whatever the | |
811 | encoding of the string in the original document, this is given to the | |
812 | handler in UTF-8. | |
813 | ||
814 | =item * Proc (Parser, Target, Data) | |
815 | ||
816 | This event is generated when a processing instruction is recognized. | |
817 | ||
818 | =item * Comment (Parser, String) | |
819 | ||
820 | This event is generated when a comment is recognized. | |
821 | ||
822 | =item * CdataStart (Parser) | |
823 | ||
824 | This is called at the start of a CDATA section. | |
825 | ||
826 | =item * CdataEnd (Parser) | |
827 | ||
828 | This is called at the end of a CDATA section. | |
829 | ||
830 | =item * Default (Parser, String) | |
831 | ||
832 | This is called for any characters that don't have a registered handler. | |
833 | This includes both characters that are part of markup for which no | |
834 | events are generated (markup declarations) and characters that | |
835 | could generate events, but for which no handler has been registered. | |
836 | ||
837 | Whatever the encoding in the original document, the string is returned to | |
838 | the handler in UTF-8. | |
839 | ||
840 | =item * Unparsed (Parser, Entity, Base, Sysid, Pubid, Notation) | |
841 | ||
842 | This is called for a declaration of an unparsed entity. Entity is the name | |
843 | of the entity. Base is the base to be used for resolving a relative URI. | |
844 | Sysid is the system id. Pubid is the public id. Notation is the notation | |
845 | name. Base and Pubid may be undefined. | |
846 | ||
847 | =item * Notation (Parser, Notation, Base, Sysid, Pubid) | |
848 | ||
849 | This is called for a declaration of notation. Notation is the notation name. | |
850 | Base is the base to be used for resolving a relative URI. Sysid is the system | |
851 | id. Pubid is the public id. Base, Sysid, and Pubid may all be undefined. | |
852 | ||
853 | =item * ExternEnt (Parser, Base, Sysid, Pubid) | |
854 | ||
855 | This is called when an external entity is referenced. Base is the base to be | |
856 | used for resolving a relative URI. Sysid is the system id. Pubid is the public | |
857 | id. Base and Pubid may be undefined. | |
858 | ||
859 | This handler should either return a string, which represents the contents of | |
860 | the external entity, or return an open filehandle that can be read to obtain | |
861 | the contents of the external entity, or return undef, which indicates the | |
862 | external entity couldn't be found and will generate a parse error. | |
863 | ||
864 | If an open filehandle is returned, it must be returned as either a glob | |
865 | (*FOO) or as a reference to a glob (e.g. an instance of IO::Handle). | |
866 | ||
867 | =item * ExternEntFin (Parser) | |
868 | ||
869 | This is called after an external entity has been parsed. It allows | |
870 | applications to perform cleanup on actions performed in the above | |
871 | ExternEnt handler. | |
872 | ||
873 | =item * Entity (Parser, Name, Val, Sysid, Pubid, Ndata, IsParam) | |
874 | ||
875 | This is called when an entity is declared. For internal entities, the Val | |
876 | parameter will contain the value and the remaining three parameters will | |
877 | be undefined. For external entities, the Val parameter | |
878 | will be undefined, the Sysid parameter will have the system id, the Pubid | |
879 | parameter will have the public id if it was provided (it will be undefined | |
880 | otherwise), the Ndata parameter will contain the notation for unparsed | |
881 | entities. If this is a parameter entity declaration, then the IsParam | |
882 | parameter is true. | |
883 | ||
884 | Note that this handler and the Unparsed handler above overlap. If both are | |
885 | set, then this handler will not be called for unparsed entities. | |
886 | ||
887 | =item * Element (Parser, Name, Model) | |
888 | ||
889 | The element handler is called when an element declaration is found. Name is | |
890 | the element name, and Model is the content model as an | |
891 | XML::Parser::ContentModel object. See L<"XML::Parser::ContentModel Methods"> | |
892 | for methods available for this class. | |
893 | ||
894 | =item * Attlist (Parser, Elname, Attname, Type, Default, Fixed) | |
895 | ||
896 | This handler is called for each attribute in an ATTLIST declaration. | |
897 | So an ATTLIST declaration that has multiple attributes | |
898 | will generate multiple calls to this handler. The Elname parameter is the | |
899 | name of the element with which the attribute is being associated. The Attname | |
900 | parameter is the name of the attribute. Type is the attribute type, given as | |
901 | a string. Default is the default value, which will either be "#REQUIRED", | |
902 | "#IMPLIED" or a quoted string (i.e. the returned string will begin and end | |
903 | with a quote character). If Fixed is true, then this is a fixed attribute. | |
904 | ||
905 | =item * Doctype (Parser, Name, Sysid, Pubid, Internal) | |
906 | ||
907 | This handler is called for DOCTYPE declarations. Name is the document type | |
908 | name. Sysid is the system id of the document type, if it was provided, | |
909 | otherwise it's undefined. Pubid is the public id of the document type, | |
910 | which will be undefined if no public id was given. Internal will be | |
911 | true or false, indicating whether or not the doctype declaration contains | |
912 | an internal subset. | |
913 | ||
914 | =item * DoctypeFin (Parser) | |
915 | ||
916 | This handler is called after parsing of the DOCTYPE declaration has finished, | |
917 | including any internal or external DTD declarations. | |
918 | ||
919 | =item * XMLDecl (Parser, Version, Encoding, Standalone) | |
920 | ||
921 | This handler is called for XML declarations. Version is a string containing | |
922 | the version. Encoding is either undefined or contains an encoding string. | |
923 | Standalone is either undefined, or true or false. Undefined indicates | |
924 | that no standalone parameter was given in the XML declaration. True or | |
925 | false indicates "yes" or "no" respectively. | |
926 | ||
927 | =back | |
928 | ||
929 | =item namespace(name) | |
930 | ||
931 | Return the URI of the namespace that the name belongs to. If the name doesn't | |
932 | belong to any namespace, an undef is returned. This is only valid on names | |
933 | received through the Start or End handlers from a single document, or through | |
934 | a call to the generate_ns_name method. In other words, don't use names | |
935 | generated from one instance of XML::Parser::Expat with other instances. | |
936 | ||
937 | =item eq_name(name1, name2) | |
938 | ||
939 | Return true if name1 and name2 are identical (i.e. same name and from | |
940 | the same namespace.) This is only meaningful if both names were obtained | |
941 | through the Start or End handlers from a single document, or through | |
942 | a call to the generate_ns_name method. | |
943 | ||
944 | =item generate_ns_name(name, namespace) | |
945 | ||
946 | Return a name, associated with a given namespace, good for using with the | |
947 | above 2 methods. The namespace argument should be the namespace URI, not | |
948 | a prefix. | |
949 | ||
950 | =item new_ns_prefixes | |
951 | ||
952 | When called from a start tag handler, returns namespace prefixes declared | |
953 | with this start tag. If called elsewhere (or if there were no namespace | |
954 | prefixes declared), it returns an empty list. Setting of the default | |
955 | namespace is indicated with '#default' as a prefix. | |
956 | ||
957 | =item expand_ns_prefix(prefix) | |
958 | ||
959 | Return the uri to which the given prefix is currently bound. Returns | |
960 | undef if the prefix isn't currently bound. Use '#default' to find the | |
961 | current binding of the default namespace (if any). | |
962 | ||
963 | =item current_ns_prefixes | |
964 | ||
965 | Return a list of currently bound namespace prefixes. The order of the | |
966 | prefixes in the list has no meaning. If the default namespace is | |
967 | currently bound, '#default' appears in the list. | |
968 | ||
969 | =item recognized_string | |
970 | ||
971 | Returns the string from the document that was recognized in order to call | |
972 | the current handler. For instance, when called from a start handler, it | |
973 | will give us the start-tag string. The string is encoded in UTF-8. | |
974 | This method doesn't return a meaningful string inside declaration handlers. | |
975 | ||
976 | =item original_string | |
977 | ||
978 | Returns the verbatim string from the document that was recognized in | |
979 | order to call the current handler. The string is in the original document | |
980 | encoding. This method doesn't return a meaningful string inside declaration | |
981 | handlers. | |
982 | ||
983 | =item default_current | |
984 | ||
985 | When called from a handler, causes the sequence of characters that generated | |
986 | the corresponding event to be sent to the default handler (if one is | |
987 | registered). Use of this method is deprecated in favor of the recognized_string | |
988 | method, which you can use without installing a default handler. This | |
989 | method doesn't deliver a meaningful string to the default handler when | |
990 | called from inside declaration handlers. | |
991 | ||
992 | =item xpcroak(message) | |
993 | ||
994 | Concatenate onto the given message the current line number within the | |
995 | XML document plus the message implied by ErrorContext. Then croak with | |
996 | the formed message. | |
997 | ||
998 | =item xpcarp(message) | |
999 | ||
1000 | Concatenate onto the given message the current line number within the | |
1001 | XML document plus the message implied by ErrorContext. Then carp with | |
1002 | the formed message. | |
1003 | ||
1004 | =item current_line | |
1005 | ||
1006 | Returns the line number of the current position of the parse. | |
1007 | ||
1008 | =item current_column | |
1009 | ||
1010 | Returns the column number of the current position of the parse. | |
1011 | ||
1012 | =item current_byte | |
1013 | ||
1014 | Returns the current byte position of the parse. | |
1015 | ||
1016 | =item base([NEWBASE]); | |
1017 | ||
1018 | Returns the current value of the base for resolving relative URIs. If | |
1019 | NEWBASE is supplied, changes the base to that value. | |
1020 | ||
1021 | =item context | |
1022 | ||
1023 | Returns a list of element names that represent open elements, with the | |
1024 | last one being the innermost. Inside start and end tag handlers, this | |
1025 | will be the tag of the parent element. | |
1026 | ||
1027 | =item current_element | |
1028 | ||
1029 | Returns the name of the innermost currently opened element. Inside | |
1030 | start or end handlers, returns the parent of the element associated | |
1031 | with those tags. | |
1032 | ||
1033 | =item in_element(NAME) | |
1034 | ||
1035 | Returns true if NAME is equal to the name of the innermost currently opened | |
1036 | element. If namespace processing is being used and you want to check | |
1037 | against a name that may be in a namespace, then use the generate_ns_name | |
1038 | method to create the NAME argument. | |
1039 | ||
1040 | =item within_element(NAME) | |
1041 | ||
1042 | Returns the number of times the given name appears in the context list. | |
1043 | If namespace processing is being used and you want to check | |
1044 | against a name that may be in a namespace, then use the generate_ns_name | |
1045 | method to create the NAME argument. | |
1046 | ||
1047 | =item depth | |
1048 | ||
1049 | Returns the size of the context list. | |
1050 | ||
1051 | =item element_index | |
1052 | ||
1053 | Returns an integer that is the depth-first visit order of the current | |
1054 | element. This will be zero outside of the root element. For example, | |
1055 | this will return 1 when called from the start handler for the root element | |
1056 | start tag. | |
1057 | ||
1058 | =item skip_until(INDEX) | |
1059 | ||
1060 | INDEX is an integer that represents an element index. When this method | |
1061 | is called, all handlers are suspended until the start tag for an element | |
1062 | that has an index number equal to INDEX is seen. If a start handler has | |
1063 | been set, then this is the first tag that the start handler will see | |
1064 | after skip_until has been called. | |
1065 | ||
1066 | ||
1067 | =item position_in_context(LINES) | |
1068 | ||
1069 | Returns a string that shows the current parse position. LINES should be | |
1070 | an integer >= 0 that represents the number of lines on either side of the | |
1071 | current parse line to place into the returned string. | |
1072 | ||
1073 | =item xml_escape(TEXT [, CHAR [, CHAR ...]]) | |
1074 | ||
1075 | Returns TEXT with markup characters turned into character entities. Any | |
1076 | additional characters provided as arguments are also turned into character | |
1077 | references where found in TEXT. | |
1078 | ||
1079 | =item parse (SOURCE) | |
1080 | ||
1081 | The SOURCE parameter should either be a string containing the whole XML | |
1082 | document, or it should be an open IO::Handle. Only a single document | |
1083 | may be parsed for a given instance of XML::Parser::Expat, so this will croak | |
1084 | if it's been called previously for this instance. | |
1085 | ||
1086 | =item parsestring(XML_DOC_STRING) | |
1087 | ||
1088 | Parses the given string as an XML document. Only a single document may be | |
1089 | parsed for a given instance of XML::Parser::Expat, so this will die if either | |
1090 | parsestring or parsefile has been called for this instance previously. | |
1091 | ||
1092 | This method is deprecated in favor of the parse method. | |
1093 | ||
1094 | =item parsefile(FILENAME) | |
1095 | ||
1096 | Parses the XML document in the given file. Will die if parsestring or | |
1097 | parsefile has been called previously for this instance. | |
1098 | ||
1099 | =item is_defaulted(ATTNAME) | |
1100 | ||
1101 | NO LONGER WORKS. To find out if an attribute is defaulted please use | |
1102 | the specified_attr method. | |
1103 | ||
1104 | =item specified_attr | |
1105 | ||
1106 | When the start handler receives lists of attributes and values, the | |
1107 | non-defaulted (i.e. explicitly specified) attributes occur in the list | |
1108 | first. This method returns the number of specified items in the list. | |
1109 | So if this number is equal to the length of the list, there were no | |
1110 | defaulted values. Otherwise the number points to the index of the | |
1111 | first defaulted attribute name. | |
1112 | ||
1113 | =item finish | |
1114 | ||
1115 | Unsets all handlers (including internal ones that set context), but expat | |
1116 | continues parsing to the end of the document or until it finds an error. | |
1117 | It should finish up a lot faster than with the handlers set. | |
1118 | ||
1119 | =item release | |
1120 | ||
1121 | There are data structures used by XML::Parser::Expat that have circular | |
1122 | references. This means that these structures will never be garbage | |
1123 | collected unless these references are explicitly broken. Calling this | |
1124 | method breaks those references (and makes the instance unusable.) | |
1125 | ||
1126 | Normally, higher level calls handle this for you, but if you are using | |
1127 | XML::Parser::Expat directly, then it's your responsibility to call it. | |
1128 | ||
1129 | =back | |
1130 | ||
1131 | =head2 XML::Parser::ContentModel Methods | |
1132 | ||
1133 | The element declaration handlers are passed objects of this class as the | |
1134 | content model of the element declaration. They also represent content | |
1135 | particles, components of a content model. | |
1136 | ||
1137 | When referred to as a string, these objects are automagically converted to a | |
1138 | string representation of the model (or content particle). | |
1139 | ||
1140 | =over 4 | |
1141 | ||
1142 | =item isempty | |
1143 | ||
1144 | This method returns true if the object is "EMPTY", false otherwise. | |
1145 | ||
1146 | =item isany | |
1147 | ||
1148 | This method returns true if the object is "ANY", false otherwise. | |
1149 | ||
1150 | =item ismixed | |
1151 | ||
1152 | This method returns true if the object is "(#PCDATA)" or "(#PCDATA|...)*", | |
1153 | false otherwise. | |
1154 | ||
1155 | =item isname | |
1156 | ||
1157 | This method returns true if the object is an element name. | |
1158 | ||
1159 | =item ischoice | |
1160 | ||
1161 | This method returns true if the object is a choice of content particles. | |
1162 | ||
1163 | ||
1164 | =item isseq | |
1165 | ||
1166 | This method returns true if the object is a sequence of content particles. | |
1167 | ||
1168 | =item quant | |
1169 | ||
1170 | This method returns undef or a string representing the quantifier | |
1171 | ('?', '*', '+') associated with the model or particle. | |
1172 | ||
1173 | =item children | |
1174 | ||
1175 | This method returns undef or (for mixed, choice, and sequence types) | |
1176 | an array of component content particles. There will always be at least | |
1177 | one component for choices and sequences, but for a mixed content model | |
1178 | of pure PCDATA, "(#PCDATA)", an undef is returned. | |
1179 | ||
1180 | =back | |
1181 | ||
1182 | =head2 XML::Parser::ExpatNB Methods | |
1183 | ||
1184 | The class XML::Parser::ExpatNB is a subclass of XML::Parser::Expat used | |
1185 | for non-blocking access to the expat library. It does not support the parse, | |
1186 | parsestring, or parsefile methods, but it does have these additional methods: | |
1187 | ||
1188 | =over 4 | |
1189 | ||
1190 | =item parse_more(DATA) | |
1191 | ||
1192 | Feed expat more text to munch on. | |
1193 | ||
1194 | =item parse_done | |
1195 | ||
1196 | Tell expat that it's gotten the whole document. | |
1197 | ||
1198 | =back | |
1199 | ||
1200 | =head1 FUNCTIONS | |
1201 | ||
1202 | =over 4 | |
1203 | ||
1204 | =item XML::Parser::Expat::load_encoding(ENCODING) | |
1205 | ||
1206 | Load an external encoding. ENCODING is either the name of an encoding or | |
1207 | the name of a file. The basename is converted to lowercase and a '.enc' | |
1208 | extension is appended unless there's one already there. Then, unless | |
1209 | it's an absolute pathname (i.e. begins with '/'), the first file by that | |
1210 | name discovered in the @Encoding_Path path list is used. | |
1211 | ||
1212 | The encoding in the file is loaded and kept in the %Encoding_Table | |
1213 | table. Earlier encodings of the same name are replaced. | |
1214 | ||
1215 | This function is automatically called by expat when it encounters an encoding | |
1216 | it doesn't know about. Expat shouldn't call this twice for the same | |
1217 | encoding name. The only reason users should use this function is to | |
1218 | explicitly load an encoding not contained in the @Encoding_Path list. | |
1219 | ||
1220 | =back | |
1221 | ||
1222 | =head1 AUTHORS | |
1223 | ||
1224 | Larry Wall <F<larry@wall.org>> wrote version 1.0. | |
1225 | ||
1226 | Clark Cooper <F<coopercc@netheaven.com>> picked up support, changed the API | |
1227 | for this version (2.x), provided documentation, and added some standard | |
1228 | package features. | |
1229 | ||
1230 | =cut |