Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package HTML::PullParser; |
2 | ||
3 | # $Id: PullParser.pm,v 2.6 2001/04/02 23:26:18 gisle Exp $ | |
4 | ||
5 | require HTML::Parser; | |
6 | @ISA=qw(HTML::Parser); | |
7 | $VERSION = sprintf("%d.%02d", q$Revision: 2.6 $ =~ /(\d+)\.(\d+)/); | |
8 | ||
9 | use strict; | |
10 | use Carp (); | |
11 | ||
# Constructor: HTML::PullParser->new(file => $file_or_handle, %options)
#          or: HTML::PullParser->new(doc  => $string_or_ref,  %options)
#
# Options named after parser events (start, end, text, declaration,
# comment, process, default) carry the argspec for that event; at least
# one of them must be defined.  Exactly one of 'file' and 'doc' must be
# given.  Everything else in %cnf is handed on to the superclass
# constructor, with api_version forced to 3.
#
# Croaks on inconsistent arguments.  Returns the new parser object, or
# nothing at all when 'file' is a file name that IO::File cannot open.
sub new {
    my ($class, %cnf) = @_;

    # Move the per-event argspecs out of the option hash so that they are
    # not forwarded to the superclass constructor.
    my %argspec_for;
    foreach my $event (qw(start end text declaration comment process default)) {
        my $spec = delete $cnf{$event};
        $argspec_for{$event} = $spec if defined $spec;
    }
    if (!%argspec_for) {
        Carp::croak("Info not collected for any events");
    }

    # Work out where the document comes from.
    my $file = delete $cnf{file};
    my $doc  = delete $cnf{doc};
    my $have_file = defined($file);
    my $have_doc  = defined($doc);
    if ($have_file && $have_doc) {
        Carp::croak("Can't parse from both 'doc' and 'file' at the same time");
    }
    if (!$have_file && !$have_doc) {
        Carp::croak("No 'doc' or 'file' given to parse from");
    }

    # Build the underlying parser object.
    $cnf{api_version} = 3;
    my $self = $class->SUPER::new(%cnf);

    # Every requested event gets the same array as its handler, so the
    # tokens simply pile up there until get_token hands them out.
    my $queue = [];
    $self->{pullparser_accum} = $queue;
    foreach my $event (keys %argspec_for) {
        $self->SUPER::handler($event => $queue, $argspec_for{$event});
    }

    if ($have_doc) {
        # Always keep a reference to the text, plus how far we have read.
        $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
        $self->{pullparser_str_pos} = 0;
        return $self;
    }

    # A plain scalar that is not a glob is taken to be a file name.
    my $is_file_name = !ref($file) && ref(\$file) ne "GLOB";
    if ($is_file_name) {
        require IO::File;
        $file = IO::File->new($file, "r") or return;
    }
    $self->{pullparser_file} = $file;

    return $self;
}
56 | ||
57 | ||
# Overrides the inherited handler() method so that it always croaks:
# the handlers of a pull parser are installed by new() only.
sub handler {
    my $complaint = "Can't set handlers for HTML::PullParser";
    Carp::croak($complaint);
}
62 | ||
63 | ||
# $token = $p->get_token
#
# Returns the next queued token, feeding the parser more input (from the
# file handle or the document string set up by new()) whenever the queue
# is empty.  Returns undef once the input is exhausted and the queue has
# been drained.  Croaks if the object has neither input source and has
# not reached end of input, which means it was not set up by new().
sub get_token {
    my $self = shift;
    my $chunk_size = 512;    # amount of input fed to the parser per step

    until (@{ $self->{pullparser_accum} } || $self->{pullparser_eof}) {
        if (my $fh = $self->{pullparser_file}) {
            # Must try to parse more from the file.
            my $buf;
            if (read($fh, $buf, $chunk_size)) {
                $self->parse($buf);
            }
            else {
                # read() yields 0 at end of file and undef on error; in
                # both cases the token stream ends here.  The handle is
                # dropped from the object but not closed.
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_file};
            }
        }
        elsif (my $sref = $self->{pullparser_str_ref}) {
            # Must try to parse more from the scalar.
            my $pos   = $self->{pullparser_str_pos};
            my $chunk = substr($$sref, $pos, $chunk_size);
            $self->parse($chunk);
            $pos += length($chunk);
            if ($pos < length($$sref)) {
                $self->{pullparser_str_pos} = $pos;
            }
            else {
                # That was the last chunk of the document.
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_str_ref};
                delete $self->{pullparser_str_pos};
            }
        }
        else {
            # Previously a bare "die", which only reported "Died at ...".
            Carp::croak("HTML::PullParser has no 'file' or 'doc' left to read tokens from");
        }
    }

    return shift @{ $self->{pullparser_accum} };
}
101 | ||
102 | ||
# $p->unget_token(@tokens)
#
# Puts the given tokens back at the front of the token queue, in the
# order given, so that get_token returns them again.  Returns the parser
# object itself.
sub unget_token {
    my ($self, @tokens) = @_;
    unshift @{ $self->{pullparser_accum} }, @tokens;
    return $self;
}
109 | ||
110 | 1; | |
111 | ||
112 | ||
113 | __END__ | |
114 | ||
115 | =head1 NAME | |
116 | ||
117 | HTML::PullParser - Alternative HTML::Parser interface | |
118 | ||
119 | =head1 SYNOPSIS | |
120 | ||
121 | use HTML::PullParser; | |
122 | ||
123 | $p = HTML::PullParser->new(file => "index.html", | |
124 | start => 'event, tagname, @attr', | |
125 | end => 'event, tagname', | |
126 | ignore_elements => [qw(script style)], | |
127 | ) || die "Can't open: $!"; | |
128 | while (my $token = $p->get_token) { | |
129 | #...do something with $token | |
130 | } | |
131 | ||
132 | =head1 DESCRIPTION | |
133 | ||
134 | The HTML::PullParser is an alternative interface to the HTML::Parser class. | |
135 | It basically turns the HTML::Parser inside out. You associate a file | |
136 | (or any IO::Handle object or string) with the parser at construction time and | |
137 | then repeatedly call $parser->get_token to obtain the tags and text | |
138 | found in the parsed document. | |
139 | ||
140 | The following methods are provided: | |
141 | ||
142 | =over 4 | |
143 | ||
144 | =item $p = HTML::PullParser->new( file => $file, %options ) | |
145 | ||
146 | =item $p = HTML::PullParser->new( doc => \$doc, %options ) | |
147 | ||
148 | A C<HTML::PullParser> can be made to parse from either a file or a | |
149 | literal document based on whether the C<file> or C<doc> option is | |
150 | passed to the parser's constructor. | |
151 | ||
152 | The C<file> passed in can either be a file name or a file handle | |
153 | object. If a file name is passed, and it can't be opened for reading, | |
154 | then the constructor will return an undefined value and $! will tell | |
155 | you why it failed. Otherwise the argument is taken to be some object | |
156 | that the C<HTML::PullParser> can read() from when it needs more data. | |
157 | The stream will be read() until EOF, but not closed. | |
158 | ||
159 | A C<doc> can be passed plain or as a reference | |
160 | to a scalar. If a reference is passed then the value of this scalar | |
161 | should not be changed before all tokens have been extracted. | |
162 | ||
163 | Next the information to be returned for the different token types must | |
164 | be set up. This is done by simply associating an argspec (as defined | |
165 | in L<HTML::Parser>) with the events you have an interest in. For | |
166 | instance, if you want C<start> tokens to be reported as the string | |
167 | C<'S'> followed by the tagname and the attributes you might pass a | |
168 | C<start> option like this: | |
169 | ||
170 | $p = HTML::PullParser->new( doc => $doc_to_parse, | |
171 | start => '"S", tagname, @attr', | |
172 | end => '"E", tagname', | |
173 | ); | |
174 | ||
175 | Finally, other C<HTML::Parser> options, like C<ignore_tags> and | |
176 | C<unbroken_text>, can be passed in. Note that you should not use the | |
177 | I<event>_h options to set up parser handlers. | |
178 | ||
179 | =item $token = $p->get_token | |
180 | ||
181 | This method will return the next I<token> found in the HTML document, | |
182 | or C<undef> at the end of the document. The token is usually returned | |
183 | as an array reference. The contents of this array match the argspec | |
184 | set up during C<HTML::PullParser> construction. | |
185 | ||
186 | =item $p->unget_token($token,...) | |
187 | ||
188 | If you find out you have read too many tokens you can push them back, | |
189 | so that they are returned again the next time $p->get_token is called. | |
190 | ||
191 | =head1 EXAMPLES | |
192 | ||
193 | The 'eg/hform' script shows how we might parse the form section of | |
194 | HTML documents using HTML::PullParser. | |
195 | ||
196 | =head1 SEE ALSO | |
197 | ||
198 | L<HTML::Parser>, L<HTML::TokeParser> | |
199 | ||
200 | =head1 COPYRIGHT | |
201 | ||
202 | Copyright 1998-2001 Gisle Aas. | |
203 | ||
204 | This library is free software; you can redistribute it and/or | |
205 | modify it under the same terms as Perl itself. | |
206 | ||
207 | =cut |