Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package HTML::TableExtract; |
2 | ||
3 | # This package extracts tables from HTML. Tables of interest may be | |
4 | # specified using header information, depth, order in a depth, or some | |
5 | # combination of the three. See the POD for more information. | |
6 | # | |
7 | # Author: Matthew P. Sisk. See the POD for copyright information. | |
8 | ||
9 | use strict; | |
10 | use Carp; | |
11 | ||
12 | use vars qw($VERSION @ISA); | |
13 | ||
14 | $VERSION = '1.08'; | |
15 | ||
16 | use HTML::Parser; | |
17 | @ISA = qw(HTML::Parser); | |
18 | ||
19 | use HTML::Entities; | |
20 | ||
21 | my %Defaults = ( | |
22 | headers => undef, | |
23 | depth => undef, | |
24 | count => undef, | |
25 | chain => undef, | |
26 | subtables => undef, | |
27 | gridmap => 1, | |
28 | decode => 1, | |
29 | automap => 1, | |
30 | br_translate => 1, | |
31 | head_include => 0, | |
32 | elastic => 1, | |
33 | keep => 0, | |
34 | keepall => 0, | |
35 | debug => 0, | |
36 | keep_html => 0, | |
37 | ); | |
38 | my $Dpat = join('|', keys %Defaults); | |
39 | ||
40 | ### Constructor | |
41 | ||
42 | sub new { | |
43 | my $that = shift; | |
44 | my $class = ref($that) || $that; | |
45 | ||
46 | my(%pass, %parms, $k, $v); | |
47 | while (($k,$v) = splice(@_, 0, 2)) { | |
48 | if ($k eq 'headers' || $k eq 'chain') { | |
49 | ref $v eq 'ARRAY' | |
50 | or croak "Param '$k' must be passed in ref to array\n"; | |
51 | if ($k eq 'chain') { | |
52 | # Filter out non-links (has refs...allows for extra commas, etc) | |
53 | @$v = grep(ref eq 'HASH', @$v); | |
54 | } | |
55 | $parms{$k} = $v; | |
56 | } | |
57 | elsif ($k =~ /^$Dpat$/) { | |
58 | $parms{$k} = $v; | |
59 | } | |
60 | else { | |
61 | $pass{$k} = $v; | |
62 | } | |
63 | } | |
64 | ||
65 | my $self = $class->SUPER::new(%pass); | |
66 | bless $self, $class; | |
67 | foreach (keys %parms, keys %Defaults) { | |
68 | $self->{$_} = exists $parms{$_} && defined $parms{$_} ? | |
69 | $parms{$_} : $Defaults{$_}; | |
70 | } | |
71 | if ($self->{headers}) { | |
72 | print STDERR "TE here, headers: ", join(',', @{$self->{headers}}),"\n" | |
73 | if $self->{debug}; | |
74 | $self->{gridmap} = 1; | |
75 | } | |
76 | # Initialize counts and containers | |
77 | $self->{_cdepth} = -1; | |
78 | $self->{_tablestack} = []; | |
79 | $self->{_tables} = {}; | |
80 | $self->{_ts_sequential} = []; | |
81 | $self->{_table_mapback} = {}; | |
82 | $self->{_counts} = {}; | |
83 | ||
84 | $self; | |
85 | } | |
86 | ||
87 | ### HTML::Parser overrides | |
88 | ||
89 | sub start { | |
90 | my $self = shift; | |
91 | ||
92 | # Create a new table state if entering a table. | |
93 | if ($_[0] eq 'table') { | |
94 | $self->_enter_table; | |
95 | } | |
96 | ||
97 | # Rows and cells are next. We obviously need not bother checking any | |
98 | # tags if we aren't in a table. | |
99 | if ($self->{_in_a_table}) { | |
100 | my $ts = $self->_current_table_state; | |
101 | my $skiptag = 0; | |
102 | if ($_[0] eq 'tr') { | |
103 | $ts->_enter_row; | |
104 | ++$skiptag; | |
105 | } | |
106 | elsif ($_[0] eq 'td' || $_[0] eq 'th') { | |
107 | $ts->_enter_cell; | |
108 | # Inspect rowspan/colspan attributes, record as necessary for | |
109 | # future column count transforms. | |
110 | if ($self->{gridmap}) { | |
111 | my %attrs = ref $_[1] ? %{$_[1]} : {}; | |
112 | if (exists $attrs{rowspan} || exists $attrs{colspan}) { | |
113 | $ts->_skew($attrs{rowspan} || 1, $attrs{colspan} || 1); | |
114 | } | |
115 | } | |
116 | ++$skiptag; | |
117 | } | |
118 | if ($self->{keep_html} && !$skiptag) { | |
119 | $self->text($_[3]); | |
120 | } | |
121 | } | |
122 | ||
123 | # <br> patch. We like to dispense with HTML, but blindly zapping | |
124 | # <br> will sometimes make the resulting text hard to parse if there | |
125 | # is no newline. Therefore, when enabled, we replace <br> with | |
126 | # newline. Pointed out by Volker Stuerzl <Volker.Stuerzl@gmx.de> | |
127 | if ($_[0] eq 'br' && $self->{br_translate} && !$self->{keep_html}) { | |
128 | $self->text("\n"); | |
129 | } | |
130 | ||
131 | ||
132 | } # end start | |
133 | ||
134 | sub end { | |
135 | my $self = shift; | |
136 | # Don't bother if we're not actually in a table. | |
137 | if ($self->{_in_a_table}) { | |
138 | my $ts = $self->_current_table_state; | |
139 | if ($_[0] eq 'td' || $_[0] eq 'th') { | |
140 | $ts->_exit_cell; | |
141 | } | |
142 | elsif ($_[0] eq 'tr') { | |
143 | $ts->_exit_row; | |
144 | } | |
145 | elsif ($_[0] eq 'table') { | |
146 | $self->_exit_table; | |
147 | } | |
148 | $self->text($_[1]) if $self->{keep_html} && $ts->{in_cell}; | |
149 | } | |
150 | } | |
151 | ||
152 | sub text { | |
153 | my $self = shift; | |
154 | # Don't bother unless we are in a table | |
155 | if ($self->{_in_a_table}) { | |
156 | my $ts = $self->_current_table_state; | |
157 | # Don't bother unless we are in a row or cell | |
158 | return unless $ts->{in_cell}; | |
159 | if ($ts->_text_hungry) { | |
160 | $ts->_taste_text($self->{decode} ? decode_entities($_[0]) : $_[0]); | |
161 | } | |
162 | } | |
163 | } | |
164 | ||
165 | ### End HTML::Parser overrides | |
166 | ||
167 | ### Report Methods | |
168 | ||
169 | sub depths { | |
170 | # Return all depths where valid tables were located. | |
171 | my $self = shift; | |
172 | return () unless ref $self->{_tables}; | |
173 | sort { $a <=> $b } keys %{$self->{_tables}}; | |
174 | } | |
175 | ||
176 | sub counts { | |
177 | # Given a depth, return the counts of all valid tables found therein. | |
178 | my($self, $depth) = @_; | |
179 | defined $depth or croak "Depth required\n"; | |
180 | sort { $a <=> $b } keys %{$self->{_tables}{$depth}}; | |
181 | } | |
182 | ||
183 | sub table { | |
184 | # Return the table content for a particular depth and count | |
185 | shift->table_state(@_)->{content}; | |
186 | } | |
187 | ||
188 | sub table_state { | |
189 | # Return the table state for a particular depth and count | |
190 | my($self, $depth, $count) = @_; | |
191 | defined $depth or croak "Depth required\n"; | |
192 | defined $count or croak "Count required\n"; | |
193 | if (! $self->{_tables}{$depth} || ! $self->{_tables}{$depth}{$count}) { | |
194 | return undef; | |
195 | } | |
196 | $self->{_tables}{$depth}{$count}; | |
197 | } | |
198 | ||
199 | sub rows { | |
200 | # Return the rows for a table. First table found if no table | |
201 | # specified. | |
202 | my($self, $table) = @_; | |
203 | my @tc; | |
204 | if (!$table) { | |
205 | $table = $self->first_table_found; | |
206 | } | |
207 | return () unless ref $table; | |
208 | my $ts = $self->{_table_mapback}{$table}; | |
209 | $ts->rows; | |
210 | } | |
211 | ||
212 | sub first_table_found { | |
213 | shift->first_table_state_found(@_)->{content}; | |
214 | } | |
215 | ||
216 | sub first_table_state_found { | |
217 | my $self = shift; | |
218 | ref $self->{_ts_sequential}[0] ? $self->{_ts_sequential}[0] : {}; | |
219 | } | |
220 | ||
221 | sub tables { | |
222 | # Return content of all valid tables found, in the order that | |
223 | # they were seen. | |
224 | map($_->{content}, shift->table_states(@_)); | |
225 | } | |
226 | ||
227 | sub table_states { | |
228 | # Return all valid table records found, in the order that | |
229 | # they were seen. | |
230 | my $self = shift; | |
231 | @{$self->{_ts_sequential}}; | |
232 | } | |
233 | ||
234 | sub table_coords { | |
235 | # Return the depth and count of a table | |
236 | my($self, $table) = @_; | |
237 | ref $table or croak "Table reference required\n"; | |
238 | my $ts = $self->{_table_mapback}{$table}; | |
239 | return () unless ref $ts; | |
240 | $ts->coords; | |
241 | } | |
242 | ||
243 | sub column_map { | |
244 | # Return the column numbers of a particular table in the same order | |
245 | # as the provided headers. | |
246 | my($self, $table) = @_; | |
247 | if (! defined $table) { | |
248 | $table = $self->first_table_found; | |
249 | } | |
250 | my $ts = $self->{_table_mapback}{$table}; | |
251 | return () unless ref $ts; | |
252 | $ts->column_map; | |
253 | } | |
254 | ||
255 | ### Runtime | |
256 | ||
257 | sub _enter_table { | |
258 | my $self = shift; | |
259 | ||
260 | ++$self->{_cdepth}; | |
261 | ++$self->{_in_a_table}; | |
262 | ||
263 | my $depth = $self->{_cdepth}; | |
264 | ||
265 | # Table states can come and go on the stack...here we retrieve the | |
266 | # table state for the table surrounding the current table tag | |
267 | # (parent table state). If the current table tag belongs to a top | |
268 | # level table, then this will be undef. | |
269 | my $pts = $self->_current_table_state; | |
270 | ||
271 | # Counts are tracked for each depth. Depth count hashes are | |
272 | # maintained for each of the table state objects; descendant | |
273 | # tables accumulate a list of these hashes, all of which track | |
274 | # counts relative to the point of view of that table state. | |
275 | my $counts = ref $pts ? $pts->{counts} : [$self->{_counts}]; | |
276 | foreach (@{$counts}) { | |
277 | my $c = $_; | |
278 | if (exists $_->{$depth}) { | |
279 | ++$_->{$depth}; | |
280 | } | |
281 | else { | |
282 | $_->{$depth} = 0; | |
283 | } | |
284 | } | |
285 | my $count = $self->{_counts}{$depth} || 0; | |
286 | ||
287 | print STDERR "TABLE: cdepth $depth, ccount $count, it: $self->{_in_a_table}\n" | |
288 | if $self->{debug} >= 2; | |
289 | ||
290 | # Umbrella status means that this current table and all of its | |
291 | # descendant tables will be harvested. This can happen when there | |
292 | # exist target conditions with no headers, depth, or count, or | |
293 | # when a particular table has been selected and the subtables | |
294 | # parameter was initially specified. | |
295 | my $umbrella = 0; | |
296 | if (ref $pts) { | |
297 | # If the subtables parameter was specified and the last table was | |
298 | # being harvested, the upcoming table (and therefore all of it's | |
299 | # descendants) is under an umbrella. | |
300 | ++$umbrella if $self->{subtables} && $pts->_active; | |
301 | } | |
302 | if (! defined $self->{depth} && !defined $self->{count} | |
303 | && !$self->{headers} && !$self->{chain}) { | |
304 | ++$umbrella; | |
305 | } | |
306 | ||
307 | # Basic parameters for the soon-to-be-created table state. | |
308 | my %tsparms = ( | |
309 | depth => $depth, | |
310 | count => $count, | |
311 | umbrella => $umbrella, | |
312 | automap => $self->{automap}, | |
313 | elastic => $self->{elastic}, | |
314 | counts => $counts, | |
315 | keep => $self->{keep}, | |
316 | keepall => $self->{keepall}, | |
317 | debug => $self->{debug}, | |
318 | keep_html => $self->{keep_html}, | |
319 | ); | |
320 | ||
321 | # Target constraints. There is no point in passing any of these | |
322 | # along if we are under an umbrella. Notice that with table states, | |
323 | # "depth" and "count" are absolute coordinates recording where this | |
324 | # table was created, whereas "tdepth" and "tcount" are the target | |
325 | # constraints. Headers and chain have no "absolute" meaning, | |
326 | # therefore are passed by the same name. | |
327 | if (!$umbrella) { | |
328 | $tsparms{tdepth} = $self->{depth} if defined $self->{depth}; | |
329 | $tsparms{tcount} = $self->{count} if defined $self->{count}; | |
330 | foreach (qw(headers chain head_include)) { | |
331 | $tsparms{$_} = $self->{$_} if defined $self->{$_}; | |
332 | } | |
333 | } | |
334 | ||
335 | # Abracadabra | |
336 | my $ts = HTML::TableExtract::TableState->new(%tsparms); | |
337 | ||
338 | # Inherit lineage | |
339 | unshift(@{$ts->{lineage}}, @{$pts->{lineage}}) if ref $pts; | |
340 | ||
341 | # Chain evolution from parent table state. Once again, however, | |
342 | # there is no point in passing the chain info along if we are under | |
343 | # an umbrella. These frames are just *potential* matches from the | |
344 | # chain. If no match occurs for a particular frame, then that frame | |
345 | # will simply be passed along to the next generation of table states | |
346 | # unchanged (assuming elastic behavior has not been disabled). Note: | |
347 | # frames based on top level constraints, as opposed to chain | |
348 | # specifications, are formed during TableState instantiation above. | |
349 | $pts->_spawn_frames($ts) if ref $self->{chain} && !$umbrella && ref $pts; | |
350 | ||
351 | # Inform the new table state that there will be no more constraints | |
352 | # forthcoming. | |
353 | $ts->_pre_latch; | |
354 | ||
355 | # Push the newly created and configured table state onto the | |
356 | # stack. This will now be the _current_table_state(). | |
357 | push(@{$self->{_tablestack}}, $ts); | |
358 | } | |
359 | ||
360 | sub _exit_table { | |
361 | my $self = shift; | |
362 | my $ts = $self->_current_table_state; | |
363 | ||
364 | # Last ditch fix for HTML mangle | |
365 | $ts->_exit_cell if $ts->{in_cell}; | |
366 | $ts->_exit_row if $ts->{in_row}; | |
367 | ||
368 | if ($ts->_active) { | |
369 | # Retain our newly captured table, assuming we bothered with it. | |
370 | $self->_add_table_state($ts); | |
371 | print STDERR "Captured table ($ts->{depth},$ts->{count})\n" | |
372 | if $self->{debug} >= 2; | |
373 | } | |
374 | ||
375 | # Restore last table state | |
376 | pop(@{$self->{_tablestack}}); | |
377 | --$self->{_in_a_table}; | |
378 | my $lts = $self->_current_table_state; | |
379 | if (ref $lts) { | |
380 | $self->{_cdepth} = $lts->{depth}; | |
381 | } | |
382 | else { | |
383 | # Back to the top level | |
384 | $self->{_cdepth} = -1; | |
385 | } | |
386 | print STDERR "LEAVE: cdepth: $self->{_cdepth}, ccount: $ts->{count}, it: $self->{_in_a_table}\n" if $self->{debug} >= 2; | |
387 | } | |
388 | ||
389 | sub _add_table_state { | |
390 | my($self, $ts) = @_; | |
391 | croak "Table state ref required\n" unless ref $ts; | |
392 | # Preliminary init sweep to appease -w | |
393 | # | |
394 | # These undefs would exist for empty <TD> since text() never got | |
395 | # called. Don't want to blindly do this in a start('td') because | |
396 | # headers might have vetoed. Also track max row length in case we | |
397 | # need to pad the other rows in gridmap mode. | |
398 | my $cmax = 0; | |
399 | foreach my $r (@{$ts->{content}}) { | |
400 | $cmax = $#$r if $#$r > $cmax; | |
401 | foreach (0 .. $#$r) { | |
402 | $r->[$_] = '' unless defined $r->[$_]; | |
403 | } | |
404 | } | |
405 | # Pad right side of columns if gridmap or header slicing | |
406 | if ($self->{gridmap}) { | |
407 | foreach my $r (@{$ts->{content}}) { | |
408 | grep($r->[$_] = '', $#$r + 1 .. $cmax) if $#$r < $cmax; | |
409 | } | |
410 | } | |
411 | ||
412 | $self->{_tables}{$ts->{depth}}{$ts->{count}} = $ts; | |
413 | $self->{_table_mapback}{$ts->{content}} = $ts; | |
414 | push(@{$self->{_ts_sequential}}, $ts); | |
415 | } | |
416 | ||
417 | sub _current_table_state { | |
418 | my $self = shift; | |
419 | $self->{_tablestack}[$#{$self->{_tablestack}}]; | |
420 | } | |
421 | ||
422 | ########## | |
423 | ||
424 | { | |
425 | ||
426 | package HTML::TableExtract::TableState; | |
427 | ||
428 | use strict; | |
429 | use Carp; | |
430 | ||
431 | sub new { | |
432 | my $that = shift; | |
433 | my $class = ref($that) || $that; | |
434 | # Note: 'depth' and 'count' are where this table were found. | |
435 | # 'tdepth' and 'tcount' are target constraints on which to trigger. | |
436 | # 'headers' represent a target constraint, location independent. | |
437 | my $self = { | |
438 | umbrella => 0, | |
439 | in_row => 0, | |
440 | in_cell => 0, | |
441 | rc => -1, | |
442 | cc => -1, | |
443 | frames => [], | |
444 | content => [], | |
445 | htxt => '', | |
446 | order => [], | |
447 | counts => [{}], | |
448 | debug => 0, | |
449 | }; | |
450 | bless $self, $class; | |
451 | ||
452 | my %parms = @_; | |
453 | ||
454 | # Depth and Count -- this is the absolute address of the table. | |
455 | croak "Absolute depth required\n" unless defined $parms{depth}; | |
456 | croak "Count required\n" unless defined $parms{count}; | |
457 | ||
458 | # Inherit count contexts | |
459 | if ($parms{counts}) { | |
460 | push(@{$self->{counts}}, @{$parms{counts}}); | |
461 | delete $parms{counts}; | |
462 | } | |
463 | ||
464 | foreach (keys %parms) { | |
465 | $self->{$_} = $parms{$_}; | |
466 | } | |
467 | ||
468 | # Register lineage | |
469 | $self->{lineage} = [ "$self->{depth},$self->{count}" ]; | |
470 | ||
471 | # Umbrella is a short circuit. This table and all descendants will | |
472 | # be harvested if the umbrella parameter was asserted. If it was | |
473 | # not, then the initial conditions specified for the new table | |
474 | # state are passed along as the first frame in the chain. | |
475 | if (!$self->{umbrella}) { | |
476 | # Frames are designed to be used when chains are specified. With | |
477 | # no chains specified, there is only a single frame, the global | |
478 | # frame, so frames become a bit redundant. We use the mechanisms | |
479 | # anyway for consistency in the extraction engine. Each frame | |
480 | # contains information that might be relative to a chain | |
481 | # frame. Currently this means depth, count, and headers. | |
482 | my %frame; | |
483 | # Frame depth and count represent target depth and count, in | |
484 | # absolute terms. If present, our initial frame takes these from | |
485 | # the target values in the table state. Unlike frames generated | |
486 | # by chains, the counts hash for the initial frame comes from | |
487 | # the global level (this is necessary since the top-level HTML | |
488 | # document has no table state from which to inherit!). Counts | |
489 | # relative to this frame will be assigned and updated based on | |
490 | # chain links, assuming there are any. | |
491 | $frame{depth} = $self->{tdepth} if exists $self->{tdepth}; | |
492 | $frame{count} = $self->{tcount} if exists $self->{tcount}; | |
493 | $frame{headers} = $self->{headers} if exists $self->{headers}; | |
494 | $frame{counts} = $self->{counts}[$#{$self->{counts}}]; | |
495 | $frame{global} = 1; | |
496 | $frame{terminus} = 1 if $self->{keep}; | |
497 | $frame{heritage} = "($self->{depth},$self->{count})"; | |
498 | $self->_add_frame(\%frame); | |
499 | } | |
500 | else { | |
501 | # Short circuit since we are an umbrella. Activate table state. | |
502 | $self->{active} = 1; | |
503 | } | |
504 | $self; | |
505 | } | |
506 | ||
507 | sub _text_hungry { | |
508 | # Text hungry only means that we are interested in gathering the | |
509 | # text, whether it be for header scanning or harvesting. | |
510 | my $self = shift; | |
511 | return 1 if $self->{umbrella}; | |
512 | return 0 if $self->{prune}; | |
513 | $self->_any_dctrigger; | |
514 | } | |
515 | ||
516 | sub _taste_text { | |
517 | # Gather the provided text, either for header scanning or | |
518 | # harvesting. | |
519 | my($self, $text) = @_; | |
520 | ||
521 | # Calculate and track skew, regardless of whether we actually want | |
522 | # this column or not. | |
523 | my $sc = $self->_skew; | |
524 | ||
525 | # Harvest if trigger conditions have been met in a terminus | |
526 | # frame. If headers have been found, and we are not beneath a | |
527 | # header column, then ignore this text. | |
528 | if ($self->_terminus_trigger && $self->_column_wanted || | |
529 | $self->{umbrella}) { | |
530 | if (defined $text) { # -w appeasement | |
531 | print STDERR "Add text '$text'\n" if $self->{debug} > 3; | |
532 | $self->_add_text($text, $sc); | |
533 | } | |
534 | } | |
535 | # Regardless of whether or not we are harvesting, we still try to | |
536 | # scan for headers in waypoint frames. | |
537 | if (defined $text && $self->_any_headers && !$self->_any_htrigger) { | |
538 | $self->_htxt($text); | |
539 | } | |
540 | 1; | |
541 | } | |
542 | ||
543 | ### Init | |
544 | ||
545 | sub _pre_latch { | |
546 | # This should be called at some point soon after object creation | |
547 | # to inform the table state that there will be no more constraints | |
548 | # added. This way latches can be pre-set if possible for | |
549 | # efficiency. | |
550 | my $self = shift; | |
551 | ||
552 | $self->_trigger_frames; | |
553 | return 0 if $self->{prune}; | |
554 | ||
555 | if ($self->{umbrella}) { | |
556 | ++$self->{dc_trigger}; | |
557 | ++$self->{head_trigger}; | |
558 | ++$self->{trigger}; | |
559 | ++$self->{active}; | |
560 | return; | |
561 | } | |
562 | # The following latches are detectable immediately for a | |
563 | # particular table state. | |
564 | $self->_terminus_dctrigger; | |
565 | $self->_any_dctrigger; | |
566 | $self->_terminus_headers; | |
567 | $self->_any_headers; | |
568 | ||
569 | } | |
570 | ||
571 | ### Latch methods...'terminus' vs 'any' is an important distinction, | |
572 | ### because conditions might only be satisifed for a waypoint | |
573 | ### frame. In this case, the next frame in the chain will be | |
574 | ### created, but the table itself will not be captured. | |
575 | ||
576 | sub _terminus_dctrigger { | |
577 | my $self = shift; | |
578 | return $self->{terminus_dctrigger} if defined $self->{terminus_dctrigger}; | |
579 | $self->{terminus_dctrigger} = $self->_check_dctrigger($self->_terminus_frames); | |
580 | } | |
581 | ||
582 | sub _any_dctrigger { | |
583 | my $self = shift; | |
584 | return $self->{any_dctrigger} if defined $self->{any_dctrigger}; | |
585 | $self->{any_dctrigger} = $self->_check_dctrigger(@{$self->{frames}}); | |
586 | } | |
587 | ||
588 | sub _terminus_headers { | |
589 | my $self = shift; | |
590 | return $self->{terminus_headers} if defined $self->{terminus_headers}; | |
591 | $self->{terminus_headers} = $self->_check_headers($self->_terminus_frames); | |
592 | } | |
593 | ||
594 | sub _any_headers { | |
595 | my $self = shift; | |
596 | return $self->{any_headers} if defined $self->{any_headers}; | |
597 | $self->{any_headers} = $self->_check_headers(@{$self->{frames}}); | |
598 | } | |
599 | ||
600 | sub _terminus_htrigger { | |
601 | # Unlike depth and count, this trigger should only latch on | |
602 | # positive values since each row is to be examined. | |
603 | my $self = shift; | |
604 | return $self->{terminus_htrigger} if $self->{terminus_htrigger}; | |
605 | $self->{terminus_htrigger} = $self->_check_htrigger($self->_terminus_frames); | |
606 | } | |
607 | ||
608 | sub _any_htrigger { | |
609 | my $self = shift; | |
610 | return $self->{any_htrigger} if defined $self->{any_htrigger}; | |
611 | $self->{any_htrigger} = $self->_check_htrigger(@{$self->{frames}}); | |
612 | } | |
613 | ||
614 | sub _terminus_trigger { | |
615 | # This has to be the same frame reporting on dc/header | |
616 | # success. First found is the hero. | |
617 | my $self = shift; | |
618 | return $self->{terminus_trigger} if $self->{terminus_trigger}; | |
619 | $self->{terminus_trigger} = $self->_check_trigger($self->_terminus_frames); | |
620 | } | |
621 | ||
622 | sub _any_trigger { | |
623 | # This has to be the same frame reporting on dc/header | |
624 | # success. First found is the hero. | |
625 | my $self = shift; | |
626 | return $self->{any_trigger} if $self->{any_trigger}; | |
627 | $self->{any_trigger} = $self->_check_trigger(@{$self->{frames}}); | |
628 | } | |
629 | ||
630 | ### Latch engines | |
631 | ||
632 | sub _check_dctrigger { | |
633 | my($self, @frames) = @_; | |
634 | return @frames if $self->{umbrella}; | |
635 | my @dctriggered; | |
636 | foreach my $f (@frames) { | |
637 | my $dc_hit = 1; | |
638 | if ($f->{null}) { | |
639 | # Special case... | |
640 | $dc_hit = 0; | |
641 | } | |
642 | else { | |
643 | if (defined $f->{depth} && $f->{depth} != $self->{depth}) { | |
644 | $dc_hit = 0; | |
645 | } | |
646 | elsif (defined $f->{count}) { | |
647 | $dc_hit = 0; | |
648 | if (exists $f->{counts}{$self->{depth}} && | |
649 | $f->{count} == $f->{counts}{$self->{depth}}) { | |
650 | # Note: frame counts, though relative to chain genesis | |
651 | # depth, are recorded in terms of absolute depths. A | |
652 | # particular counts hash is shared among frames descended | |
653 | # from the same chain instance. | |
654 | $dc_hit = 1; | |
655 | } | |
656 | } | |
657 | } | |
658 | push(@dctriggered, $f) if $dc_hit; | |
659 | } | |
660 | return @dctriggered ? \@dctriggered : undef; | |
661 | } | |
662 | ||
663 | sub _check_htrigger { | |
664 | my($self, @frames) = @_; | |
665 | my @htriggered; | |
666 | foreach my $f (@frames) { | |
667 | if ($f->{headers}) { | |
668 | push(@htriggered, $f) if $f->{head_found}; | |
669 | } | |
670 | else { | |
671 | push(@htriggered, $f); | |
672 | } | |
673 | } | |
674 | @htriggered ? \@htriggered : undef; | |
675 | } | |
676 | ||
677 | sub _check_trigger { | |
678 | # This has to be the same frame reporting on dc/header | |
679 | # success. First found is the hero. | |
680 | my($self, @frames) = @_; | |
681 | return () unless @frames; | |
682 | my $tdct = $self->_check_dctrigger(@frames); | |
683 | my $tht = $self->_check_htrigger(@frames); | |
684 | my %tframes; | |
685 | my %tdc_frames; | |
686 | foreach (ref $tdct ? @$tdct : ()) { | |
687 | $tdc_frames{$_} = $_; | |
688 | $tframes{$_} = $_ unless $tframes{$_}; | |
689 | } | |
690 | my %th_frames; | |
691 | foreach (ref $tht ? @$tht : ()) { | |
692 | $th_frames{$_} = $_; | |
693 | $tframes{$_} = $_ unless $tframes{$_}; | |
694 | } | |
695 | my @frame_order = grep($tframes{$_}, @frames); | |
696 | my @triggered; | |
697 | foreach (@frame_order) { | |
698 | if ($tdc_frames{$_} && $th_frames{$_}) { | |
699 | push(@triggered, $tframes{$_}); | |
700 | } | |
701 | } | |
702 | @triggered ? \@triggered : undef; | |
703 | } | |
704 | ||
705 | sub _check_headers { | |
706 | my($self, @frames) = @_; | |
707 | foreach my $f (@frames) { | |
708 | return 1 if $f->{headers}; | |
709 | } | |
710 | 0; | |
711 | } | |
712 | ||
713 | ### | |
714 | ||
715 | sub _terminus_frames { | |
716 | # Return all frames that are at the end of a chain, or specified | |
717 | # as a terminus. | |
718 | my $self = shift; | |
719 | my @res; | |
720 | foreach (@{$self->{frames}}) { | |
721 | push(@res, $_) if $_->{terminus}; | |
722 | } | |
723 | @res; | |
724 | } | |
725 | ||
726 | ### | |
727 | ||
728 | sub _trigger_frames { | |
729 | # Trigger each frame whose conditions have been met (i.e., rather | |
730 | # than merely detect conditions, set state in the affected frame | |
731 | # as well). | |
732 | my $self = shift; | |
733 | if (!@{$self->{frames}}) { | |
734 | ++$self->{prune}; | |
735 | return 0; | |
736 | } | |
737 | my $t = 0; | |
738 | foreach my $f (@{$self->{frames}}) { | |
739 | if ($f->{triggered}) { | |
740 | ++$t; | |
741 | next; | |
742 | } | |
743 | if ($self->_check_trigger($f)) { | |
744 | ++$t; | |
745 | $f->{triggered} = 1; | |
746 | } | |
747 | } | |
748 | $t; | |
749 | } | |
750 | ||
751 | ### Maintain table context | |
752 | ||
753 | sub _enter_row { | |
754 | my $self = shift; | |
755 | $self->_exit_cell if $self->{in_cell}; | |
756 | $self->_exit_row if $self->{in_row}; | |
757 | ++$self->{rc}; | |
758 | ++$self->{in_row}; | |
759 | ||
760 | # Reset next_col for gridmapping | |
761 | $self->{next_col} = 0; | |
762 | while ($self->{taken}{"$self->{rc},$self->{next_col}"}) { | |
763 | ++$self->{next_col}; | |
764 | } | |
765 | ||
766 | ++$self->{active} if $self->_terminus_trigger; | |
767 | if ($self->{active}) { | |
768 | # Add the new row, unless we're using headers and are still in | |
769 | # the header row | |
770 | push(@{$self->{content}}, []) | |
771 | unless $self->_terminus_headers && $self->_still_in_header_row; | |
772 | } | |
773 | $self->_evolve_frames if $self->_trigger_frames; | |
774 | } | |
775 | ||
776 | sub _exit_row { | |
777 | my $self = shift; | |
778 | if ($self->{in_row}) { | |
779 | $self->_exit_cell if $self->{in_cell}; | |
780 | $self->{in_row} = 0; | |
781 | $self->{cc} = -1; | |
782 | $self->_reset_header_scanners; | |
783 | if ($self->_terminus_headers && $self->_still_in_header_row) { | |
784 | ++$self->{hslurp}; | |
785 | # Store header row number so that we can adjust later (we keep | |
786 | # it around for now in case of skew situations, which are in | |
787 | # absolute row terms) | |
788 | $self->{hrow} = $self->{rc}; | |
789 | } | |
790 | } | |
791 | else { | |
792 | print STDERR "Mangled HTML in table ($self->{depth},$self->{count}), extraneous </TR> ignored after row $self->{rc}\n" | |
793 | if $self->{debug}; | |
794 | } | |
795 | } | |
796 | ||
797 | sub _enter_cell { | |
798 | my $self = shift; | |
799 | $self->_exit_cell if $self->{in_cell}; | |
800 | if (!$self->{in_row}) { | |
801 | # Go ahead and try to recover from mangled HTML, because we | |
802 | # care. | |
803 | print STDERR "Mangled HTML in table ($self->{depth},$self->{count}), inferring <TR> as row $self->{rc}\n" | |
804 | if $self->{debug}; | |
805 | $self->_enter_row; | |
806 | } | |
807 | ++$self->{cc}; | |
808 | ++$self->{in_cell}; | |
809 | } | |
810 | ||
811 | sub _exit_cell { | |
812 | my $self = shift; | |
813 | if ($self->{in_cell}) { | |
814 | # Trigger taste_text just in case this was an empty cell. | |
815 | $self->_taste_text(undef) if $self->_text_hungry; | |
816 | $self->{in_cell} = 0; | |
817 | $self->_hmatch; | |
818 | } | |
819 | else { | |
820 | print STDERR "Mangled HTML in table ($self->{depth},$self->{count}), extraneous </TD> ignored in row $self->{rc}\n" | |
821 | if $self->{debug}; | |
822 | } | |
823 | } | |
824 | ||
825 | ### | |
826 | ||
827 | sub _add_frame { | |
828 | # Add new frames to this table state. | |
829 | my($self, @frames) = @_; | |
830 | return 1 if $self->{umbrella}; | |
831 | foreach my $f (@frames) { | |
832 | ref $f or croak "Hash ref required\n"; | |
833 | ||
834 | if (! exists $f->{depth} && ! exists $f->{count} && ! $f->{headers}) { | |
835 | # Special case. If there were no constraints, then umbrella | |
836 | # gets set. Otherwise, with chains, we want all nodes to | |
837 | # trigger but not become active due to the potential chain | |
838 | # constraint. This is just a heads up. | |
839 | ++$f->{null}; | |
840 | } | |
841 | ||
842 | # Take the opportunity to prune frames that are out of their | |
843 | # depth. Keep in mind, depths are specified in absolute terms | |
844 | # for frames, as opposed to relative terms in chains. | |
845 | if (defined $f->{depth} && $f->{depth} < $self->{depth}) { | |
846 | print STDERR "Pruning frame for depth $f->{depth} at depth $self->{depth}\n" if $self->{debug} > 2; | |
847 | next; | |
848 | } | |
849 | ||
850 | # If we are an intermediary in a chain, we will never trigger a | |
851 | # harvest (well, unless 'keep' was specified, anyway). Avoid | |
852 | # autovivifying here, because $self->{chain} is used as a test | |
853 | # many times. | |
854 | if (ref $self->{chain}) { | |
855 | if (defined $f->{chaindex} && $f->{chaindex} == $#{$self->{chain}}) { | |
856 | ++$f->{terminus}; | |
857 | } | |
858 | } | |
859 | elsif ($f->{global}) { | |
860 | # If there is no chain, the global frame is a terminus. | |
861 | ++$f->{terminus}; | |
862 | } | |
863 | ||
864 | # Scoop all triggers if keepall has been asserted. | |
865 | if ($self->{keepall}) { | |
866 | ++$f->{terminus}; | |
867 | } | |
868 | ||
869 | # Set up header pattern if we have headers. | |
870 | if ($f->{headers}) { | |
871 | my $hstring = '(' . join('|', map("($_)", @{$f->{headers}})) . ')'; | |
872 | print STDERR "HPAT: /$hstring/\n" if $self->{debug} >= 2; | |
873 | $f->{hpat} = $hstring; | |
874 | $self->_reset_hits($f); | |
875 | } | |
876 | ||
877 | if ($self->{debug} > 3) { | |
878 | print STDERR "Adding frame ($f):\n {\n"; | |
879 | foreach (sort keys %$f) { | |
880 | next unless defined $f->{$_}; # appease -w | |
881 | print STDERR " $_ => $f->{$_}\n"; | |
882 | } | |
883 | print STDERR " }\n"; | |
884 | } | |
885 | ||
886 | push(@{$self->{frames}}, $f); | |
887 | } | |
888 | # Activate header state if there were any header conditions in the | |
889 | # frames. | |
890 | $self->_scan_state('headers'); | |
891 | # Arbitrary true return value. | |
892 | scalar @{$self->{frames}}; | |
893 | } | |
894 | ||
895 | # Header stuff | |
896 | ||
897 | sub _htxt { | |
898 | # Accumulate or reset header text. This is shared across all | |
899 | # frames. | |
900 | my $self = shift; | |
901 | if (@_) { | |
902 | if (defined $_[0]) { | |
903 | $self->{htxt} .= $_[0] if $_[0] !~ /^\s*$/; | |
904 | } | |
905 | else { | |
906 | $self->{htxt} = ''; | |
907 | } | |
908 | } | |
909 | $self->{htxt}; | |
910 | } | |
911 | ||
912 | sub _hmatch { | |
913 | # Given the current htxt, test all frames for matches. This *will* | |
914 | # set state in the frames in the event of a match. | |
915 | my $self = shift; | |
916 | my @hits; | |
917 | return 0 unless $self->_any_headers; | |
918 | foreach my $f (@{$self->{frames}}) { | |
919 | next unless $f->{hpat}; | |
920 | if ($self->{htxt} =~ /$f->{hpat}/im) { | |
921 | my $hit = $1; | |
922 | print STDERR "HIT on '$hit' in $self->{htxt} ($self->{rc},$self->{cc})\n" if $self->{debug} >= 4; | |
923 | ++$f->{scanning}; | |
924 | # Get rid of the header segment that matched so we can tell | |
925 | # when we're through with all header patterns. | |
926 | foreach (keys %{$f->{hits_left}}) { | |
927 | if ($hit =~ /$_/im) { | |
928 | delete $f->{hits_left}{$_}; | |
929 | $hit = $_; | |
930 | last; | |
931 | } | |
932 | } | |
933 | push(@hits, $hit); | |
934 | # | |
935 | my $cc = $self->_skew; | |
936 | $f->{hits}{$cc} = $hit; | |
937 | push(@{$f->{order}}, $cc); | |
938 | if (!%{$f->{hits_left}}) { | |
939 | # We have found all headers for this frame, but we won't | |
940 | # start slurping until this row has ended | |
941 | ++$f->{head_found}; | |
942 | $f->{scanning} = undef; | |
943 | } | |
944 | } | |
945 | } | |
946 | # Propogate relevant frame states to overall table state. | |
947 | foreach (qw(head_found scanning)) { | |
948 | $self->_scan_state($_); | |
949 | } | |
950 | # Reset htxt buffer | |
951 | $self->_htxt(undef); | |
952 | ||
953 | wantarray ? @hits : scalar @hits; | |
954 | } | |
955 | ||
956 | # Header and header state booleans | |
957 | ||
958 | sub _scan_state { | |
959 | # This just sets analagous flags on a table state basis | |
960 | # rather than a frame basis, for testing efficiency to | |
961 | # reduce the number of method calls involved. | |
962 | my($self, $state) = @_; | |
963 | foreach (@{$self->{frames}}) { | |
964 | ++$self->{$state} if $_->{state}; | |
965 | } | |
966 | $self->{$state}; | |
967 | } | |
968 | ||
969 | sub _headers { shift->_check_state('headers' ) } | |
970 | sub _head_found { shift->_check_state('head_found') } | |
971 | sub _scanning { shift->_check_state('scanning') } | |
972 | ||
973 | # End header stuff | |
974 | ||
975 | sub _check_state { | |
976 | my($self, $state) = @_; | |
977 | defined $state or croak "State name required\n"; | |
978 | my @frames_with_state; | |
979 | foreach my $f (@{$self->{frames}}) { | |
980 | push(@frames_with_state, $f) if $f->{$state}; | |
981 | } | |
982 | return () unless @frames_with_state; | |
983 | wantarray ? @frames_with_state : $frames_with_state[0]; | |
984 | } | |
985 | ||
986 | # Misc | |
987 | ||
988 | sub _evolve_frames { | |
989 | # Retire frames that were triggered; integrate the next link in | |
990 | # the chain if available. If it was the global frame, or the frame | |
991 | # generated from the last in the chain sequence, then activate the | |
992 | # frame and start a new chain. | |
993 | my $self = shift; | |
994 | return if $self->{evolved}; | |
995 | $self->{newframes} = [] unless $self->{newframes}; | |
996 | foreach my $f (@{$self->{frames}}) { | |
997 | # We're only interested in newly triggered frames. | |
998 | next if !$f->{triggered} || $f->{retired}; | |
999 | my %new; | |
1000 | if ($self->{chain}) { | |
1001 | if ($f->{global}) { | |
1002 | # We are the global frame, and we have a chain. Spawn a new | |
1003 | # chain. | |
1004 | $new{chaindex} = 0; | |
1005 | # Chain counts are always relative to the table state in | |
1006 | # which frame genisis occurred. Table states inherit the | |
1007 | # count contexts of parent table states, so that they can be | |
1008 | # updated (and therefore descendant frames get updated as | |
1009 | # well). Count contexts are represented as hashes with | |
1010 | # depths as keys. This frame-specific hash is shared amongst | |
1011 | # all frames descended from chains started in this table | |
1012 | # state. | |
1013 | $new{heritage} = "($self->{depth},$self->{count})"; | |
1014 | } | |
1015 | elsif (defined $f->{chaindex}) { | |
1016 | # Generate a new frame based on the next link in the chain | |
1017 | # (unless we are the global frame, in which case we initialize | |
1018 | # a new chain since there is no chain link for the global | |
1019 | # frame). | |
1020 | $new{chaindex} = $f->{chaindex} + 1; | |
1021 | # Relative counts always are inherited from chain genesis. We | |
1022 | # pass by reference so siblings can all update the depth | |
1023 | # counts for that chain. | |
1024 | $new{heritage} = $f->{heritage}; | |
1025 | } | |
1026 | } | |
1027 | ||
1028 | if ($f->{terminus}) { | |
1029 | # This is a hit since we matched either in the global frame, | |
1030 | # the last link of the chain, or in a link specified as a | |
1031 | # keeper. | |
1032 | ++$f->{active} unless $f->{null}; | |
1033 | # If there is a chain, start a new one from this match if it | |
1034 | # was the global frame (if we ever decided to have chains | |
1035 | # spawn chains, this would be the place to do it. Currently | |
1036 | # only the global frame spawns chains). | |
1037 | ||
1038 | } | |
1039 | ||
1040 | # Since we triggered, one way or the other this frame is retired. | |
1041 | ++$f->{retired}; | |
1042 | ||
1043 | # Frames always inherit the count context of the table state in | |
1044 | # which they were created. | |
1045 | $new{counts} = $self->{counts}[0]; | |
1046 | ||
1047 | if (defined $new{chaindex}) { | |
1048 | my $link = $self->{chain}[$new{chaindex}]; | |
1049 | # Tables immediately below the current table state are | |
1050 | # considered depth 0 as specified in chains...hence actual | |
1051 | # depth plus one forms the basis for depth 0 in relative | |
1052 | # terms. | |
1053 | $new{depth} = ($self->{depth} + 1) + $link->{depth} | |
1054 | if exists $link->{depth}; | |
1055 | $new{count} = $link->{count} if exists $link->{count}; | |
1056 | $new{headers} = $link->{headers} if exists $link->{headers}; | |
1057 | ++$new{terminus} if $link->{keep}; | |
1058 | if ($self->{debug} > 3) { | |
1059 | print STDERR "New proto frame (in ts $self->{depth},$self->{count}) for chain rule $new{chaindex}\n"; | |
1060 | print STDERR " {\n"; | |
1061 | foreach (sort keys %new) { | |
1062 | print STDERR " $_ => $new{$_}"; | |
1063 | if ($_ eq 'counts') { | |
1064 | print STDERR " ",join(' ', map("$_,$new{counts}{$_}", | |
1065 | sort { $a <=> $b } keys %{$new{counts}})); | |
1066 | } | |
1067 | print STDERR "\n"; | |
1068 | } | |
1069 | print STDERR " }\n"; | |
1070 | } | |
1071 | push(@{$self->{newframes}}, \%new); | |
1072 | } | |
1073 | ||
1074 | } | |
1075 | # See if we're done evolving our frames. | |
1076 | foreach my $f (@{$self->{frames}}) { | |
1077 | return 0 unless $f->{retired}; | |
1078 | } | |
1079 | # If we are, then flag the whole table state as evolved. | |
1080 | ++$self->{evolved}; | |
1081 | } | |
1082 | ||
1083 | sub _spawn_frames { | |
1084 | # Build and pass new frames to a child table state. This involves | |
1085 | # retiring old frames and passing along untriggered and new | |
1086 | # frames. | |
1087 | my($self, $child) = @_; | |
1088 | ref $child or croak "Child table state required\n"; | |
1089 | if ($self->{umbrella}) { | |
1090 | # Don't mess with frames, just pass the umbrella. | |
1091 | ++$child->{umbrella}; | |
1092 | return; | |
1093 | } | |
1094 | ||
1095 | my @frames; | |
1096 | my @fields = qw(chaindex depth count headers counts heritage terminus); | |
1097 | ||
1098 | foreach my $f (@{$self->{frames}}) { | |
1099 | # Not interested in retired frames (which just matched), root | |
1100 | # frames (which get regenerated each time a frame is created), | |
1101 | # or in unmatched frames when not in elastic mode. | |
1102 | next if !$self->{elastic} || $f->{retired}; | |
1103 | my %new; | |
1104 | foreach (grep(exists $f->{$_}, @fields)) { | |
1105 | $new{$_} = $f->{$_}; | |
1106 | } | |
1107 | push(@frames, \%new); | |
1108 | } | |
1109 | ||
1110 | # Always interested in newly created frames. Make sure and pass | |
1111 | # copies, though, so that siblings don't update each others frame | |
1112 | # sets. | |
1113 | foreach my $f (@{$self->{newframes}}) { | |
1114 | my %new; | |
1115 | foreach (grep(exists $f->{$_}, @fields)) { | |
1116 | $new{$_} = $f->{$_}; | |
1117 | } | |
1118 | push(@frames, \%new); | |
1119 | } | |
1120 | ||
1121 | $child->_add_frame(@frames) if @frames; | |
1122 | } | |
1123 | ||
1124 | # Report methods | |
1125 | ||
1126 | sub depth { shift->{depth} } | |
1127 | sub count { shift->{count} } | |
1128 | sub coords { | |
1129 | my $self = shift; | |
1130 | ($self->depth, $self->count); | |
1131 | } | |
1132 | ||
1133 | sub lineage { | |
1134 | my $self = shift; | |
1135 | map([split(',', $_)], @{$self->{lineage}}); | |
1136 | } | |
1137 | ||
1138 | sub rows { | |
1139 | my $self = shift; | |
1140 | if ($self->{automap} && $self->_map_makes_a_difference) { | |
1141 | my @tc; | |
1142 | my @cm = $self->column_map; | |
1143 | foreach (@{$self->{content}}) { | |
1144 | my $r = [@{$_}[@cm]]; | |
1145 | # since there could have been non-existent <TD> we need | |
1146 | # to double check initilization to appease -w | |
1147 | foreach (0 .. $#$r) { | |
1148 | $r->[$_] = '' unless defined $r->[$_]; | |
1149 | } | |
1150 | push(@tc, $r); | |
1151 | } | |
1152 | return @tc; | |
1153 | } | |
1154 | # No remapping | |
1155 | @{$self->{content}}; | |
1156 | } | |
1157 | ||
1158 | sub column_map { | |
1159 | # Return the column numbers of this table in the same order as the | |
1160 | # provided headers. | |
1161 | my $self = shift; | |
1162 | my $tframes = $self->_terminus_trigger; | |
1163 | my $tframe = ref $tframes ? $tframes->[0] : undef; | |
1164 | if ($tframe && $tframe->{headers}) { | |
1165 | # First we order the original column counts by taking a hash | |
1166 | # slice based on the original header order. The resulting | |
1167 | # original column numbers are mapped to the actual content | |
1168 | # indicies since we could have a sparse slice. | |
1169 | my %order; | |
1170 | foreach (keys %{$tframe->{hits}}) { | |
1171 | $order{$tframe->{hits}{$_}} = $_; | |
1172 | } | |
1173 | return @order{@{$tframe->{headers}}}; | |
1174 | } | |
1175 | else { | |
1176 | return 0 .. $#{$self->{content}[0]}; | |
1177 | } | |
1178 | } | |
1179 | ||
1180 | sub _map_makes_a_difference { | |
1181 | my $self = shift; | |
1182 | my $diff = 0; | |
1183 | my @order = $self->column_map; | |
1184 | my @sorder = sort { $a <=> $b } @order; | |
1185 | ++$diff if $#order != $#sorder; | |
1186 | ++$diff if $#sorder != $#{$self->{content}[0]}; | |
1187 | foreach (0 .. $#order) { | |
1188 | if ($order[$_] != $sorder[$_]) { | |
1189 | ++$diff; | |
1190 | last; | |
1191 | } | |
1192 | } | |
1193 | $diff; | |
1194 | } | |
1195 | ||
1196 | sub _add_text { | |
1197 | my($self, $txt, $skew_column) = @_; | |
1198 | # We don't check for $txt being defined, sometimes we want to | |
1199 | # merely insert a placeholder in the content. | |
1200 | my $row = $self->{content}[$#{$self->{content}}]; | |
1201 | if (! defined $row->[$skew_column]) { | |
1202 | # Init to appease -w | |
1203 | $row->[$skew_column] = ''; | |
1204 | } | |
1205 | return unless defined $txt; | |
1206 | $row->[$skew_column] .= $txt; | |
1207 | $txt; | |
1208 | } | |
1209 | ||
1210 | sub _skew { | |
1211 | # Skew registers the effects of rowspan/colspan issues when | |
1212 | # gridmap is enabled. | |
1213 | ||
1214 | my($self, $rspan, $cspan) = @_; | |
1215 | my($r,$c) = ($self->{rc},$self->{cc}); | |
1216 | ||
1217 | if ($self->{debug} > 5) { | |
1218 | print STDERR "($self->{rc},$self->{cc}) Inspecting skew for ($r,$c)"; | |
1219 | print STDERR defined $rspan ? " (set with $rspan,$cspan)\n" : "\n"; | |
1220 | } | |
1221 | ||
1222 | my $sc = $c; | |
1223 | if (! defined $self->{skew_cache}{"$r,$c"}) { | |
1224 | $sc = $self->{next_col} if defined $self->{next_col}; | |
1225 | $self->{skew_cache}{"$r,$c"} = $sc; | |
1226 | my $next_col = $sc + 1; | |
1227 | while ($self->{taken}{"$r,$next_col"}) { | |
1228 | ++$next_col; | |
1229 | } | |
1230 | $self->{next_col} = $next_col; | |
1231 | } | |
1232 | else { | |
1233 | $sc = $self->{skew_cache}{"$r,$c"}; | |
1234 | } | |
1235 | ||
1236 | # If we have span arguments, set skews | |
1237 | if (defined $rspan) { | |
1238 | # Default span is always 1, even if not explicitly stated. | |
1239 | $rspan = 1 unless $rspan; | |
1240 | $cspan = 1 unless $cspan; | |
1241 | --$rspan; | |
1242 | --$cspan; | |
1243 | # 1,1 is a degenerate case, there's nothing to do. | |
1244 | if ($rspan || $cspan) { | |
1245 | foreach my $rs (0 .. $rspan) { | |
1246 | my $cr = $r + $rs; | |
1247 | # If we in the same row as the skewer, the "span" is one less | |
1248 | # because the skewer cell occupies the same row. | |
1249 | my $start_col = $rs ? $sc : $sc + 1; | |
1250 | my $fin_col = $sc + $cspan; | |
1251 | foreach ($start_col .. $fin_col) { | |
1252 | $self->{taken}{"$cr,$_"} = "$r,$sc" unless $self->{taken}{"$cr,$_"}; | |
1253 | } | |
1254 | if (!$rs) { | |
1255 | my $next_col = $fin_col + 1; | |
1256 | while ($self->{taken}{"$cr,$next_col"}) { | |
1257 | ++$next_col; | |
1258 | } | |
1259 | $self->{next_col} = $next_col; | |
1260 | } | |
1261 | } | |
1262 | } | |
1263 | } | |
1264 | ||
1265 | # Grid column number | |
1266 | $sc; | |
1267 | } | |
1268 | ||
1269 | sub _reset_header_scanners { | |
1270 | # When a row ends, this should be called in order to reset frames | |
1271 | # who are in the midst of header scans. | |
1272 | my $self = shift; | |
1273 | my @scanners; | |
1274 | foreach my $f (@{$self->{frames}}) { | |
1275 | next unless $f->{headers} && $f->{scanning}; | |
1276 | if ($self->{debug}) { | |
1277 | my $str = "Incomplete header match in row $self->{rc}, resetting scan"; | |
1278 | $str .= " link $f->{chaindex}" if defined $f->{chaindex}; | |
1279 | $str .= "\n"; | |
1280 | print STDERR $str; | |
1281 | } | |
1282 | push(@scanners, $f); | |
1283 | } | |
1284 | $self->_reset_hits(@scanners) if @scanners; | |
1285 | } | |
1286 | ||
1287 | sub _header_quest { | |
1288 | # Loosely translated: "Should I even bother scanning for header | |
1289 | # matches?" | |
1290 | my $self = shift; | |
1291 | return 0 unless $self->_any_headers && !$self->_head_found; | |
1292 | foreach my $f (@{$self->{frames}}) { | |
1293 | return 1 if $f->{headers} && $f->{dc_trigger}; | |
1294 | } | |
1295 | 0; | |
1296 | } | |
1297 | ||
1298 | sub _still_in_header_row { | |
1299 | my $self = shift; | |
1300 | return 0 unless $self->_terminus_headers; | |
1301 | !$self->{hslurp} && $self->_terminus_htrigger; | |
1302 | } | |
1303 | ||
1304 | # Non waypoint answers | |
1305 | ||
1306 | sub _active { | |
1307 | my $self = shift; | |
1308 | return 1 if $self->{active}; | |
1309 | my @active; | |
1310 | foreach my $f (@{$self->{frames}}) { | |
1311 | push(@active, $f) if $f->{active}; | |
1312 | } | |
1313 | return () unless @active; | |
1314 | ++$self->{active} if @active; | |
1315 | wantarray ? @active : $active[0]; | |
1316 | } | |
1317 | ||
1318 | sub _column_wanted { | |
1319 | my $self = shift; | |
1320 | my $tframes = $self->_terminus_trigger; | |
1321 | my $tframe = ref $tframes ? $tframes->[0] : undef; | |
1322 | return 0 unless $tframe; | |
1323 | my $wanted = 1; | |
1324 | if ($self->_terminus_headers && $self->{hslurp}) { | |
1325 | # If we are using headers, veto the grab unless we are in an | |
1326 | # applicable column beneath one of the headers. | |
1327 | $wanted = 0 | |
1328 | unless exists $tframe->{hits}{$self->_skew}; | |
1329 | } | |
1330 | print STDERR "Want ($self->{rc},$self->{cc}): $wanted\n" | |
1331 | if $self->{debug} > 7; | |
1332 | $wanted; | |
1333 | } | |
1334 | ||
1335 | sub _reset_hits { | |
1336 | # Reset hits in provided frames. WARNING!!! If you do not provide | |
1337 | # frames, all frames will be reset! | |
1338 | my($self, @frames) = @_; | |
1339 | foreach my $frame (@frames ? @frames : @{$self->{frames}}) { | |
1340 | next unless $frame->{headers}; | |
1341 | $frame->{hits} = {}; | |
1342 | $frame->{order} = []; | |
1343 | $frame->{scanning} = undef; | |
1344 | foreach (@{$frame->{headers}}) { | |
1345 | ++$frame->{hits_left}{$_}; | |
1346 | } | |
1347 | } | |
1348 | 1; | |
1349 | } | |
1350 | ||
1351 | } | |
1352 | ||
1353 | 1; | |
1354 | ||
1355 | __END__ | |
1356 | ||
1357 | =head1 NAME | |
1358 | ||
1359 | HTML::TableExtract - Perl extension for extracting the text contained in tables within an HTML document. | |
1360 | ||
1361 | =head1 SYNOPSIS | |
1362 | ||
1363 | # Matched tables are returned as "table state" objects; tables can be | |
1364 | # matched using column headers, depth, count within a depth, or some | |
1365 | # combination of the three. | |
1366 | ||
1367 | # Using column header information. Assume an HTML document with | |
1368 | # tables that have "Date", "Price", and "Cost" somewhere in a | |
1369 | # row. The columns beneath those headings are what you want to | |
1370 | # extract. They will be returned in the same order as you specified | |
1371 | # the headers since 'automap' is enabled by default. | |
1372 | ||
1373 | use HTML::TableExtract; | |
1374 | $te = new HTML::TableExtract( headers => [qw(Date Price Cost)] ); | |
1375 | $te->parse($html_string); | |
1376 | ||
1377 | # Examine all matching tables | |
1378 | foreach $ts ($te->table_states) { | |
1379 | print "Table (", join(',', $ts->coords), "):\n"; | |
1380 | foreach $row ($ts->rows) { | |
1381 | print join(',', @$row), "\n"; | |
1382 | } | |
1383 | } | |
1384 | ||
1385 | # Old style, using top level methods rather than table state objects. | |
1386 | foreach $table ($te->tables) { | |
1387 | print "Table (", join(',', $te->table_coords($table)), "):\n"; | |
1388 | foreach $row ($te->rows($table)) { | |
1389 | print join(',', @$row), "\n"; | |
1390 | } | |
1391 | } | |
1392 | ||
1393 | # Shorthand...top level rows() method assumes the first table found | |
1394 | # in the document if no arguments are supplied. | |
1395 | foreach $row ($te->rows) { | |
1396 | print join(',', @$row), "\n"; | |
1397 | } | |
1398 | ||
1399 | # Using depth and count information. Every table in the document has | |
1400 | # a unique depth and count tuple, so when both are specified it is a | |
1401 | # unique table. Depth and count both begin with 0, so in this case we | |
1402 | # are looking for a table (depth 2) within a table (depth 1) within a | |
1403 | # table (depth 0, which is the top level HTML document). In addition, | |
1404 | # it must be the third (count 2) such instance of a table at that | |
1405 | # depth. | |
1406 | ||
1407 | $te = new HTML::TableExtract( depth => 2, count => 2 ); | |
1408 | $te->parse($html_string); | |
1409 | foreach $ts ($te->table_states) { | |
1410 | print "Table found at ", join(',', $ts->coords), ":\n"; | |
1411 | foreach $row ($ts->rows) { | |
1412 | print " ", join(',', @$row), "\n"; | |
1413 | } | |
1414 | } | |
1415 | ||
1416 | =head1 DESCRIPTION | |
1417 | ||
1418 | HTML::TableExtract is a subclass of HTML::Parser that serves to | |
1419 | extract the textual information from tables of interest contained | |
1420 | within an HTML document. The text from each extracted table is stored | |
1421 | in tabe state objects which hold the information as an array of arrays | |
1422 | that represent the rows and cells of that table. | |
1423 | ||
1424 | There are three constraints available to specify which tables you | |
1425 | would like to extract from a document: I<Headers>, I<Depth>, and | |
1426 | I<Count>. | |
1427 | ||
1428 | I<Headers>, the most flexible and adaptive of the techniques, involves | |
1429 | specifying text in an array that you expect to appear above the data | |
1430 | in the tables of interest. Once all headers have been located in a row | |
1431 | of that table, all further cells beneath the columns that matched your | |
1432 | headers are extracted. All other columns are ignored: think of it as | |
1433 | vertical slices through a table. In addition, TableExtract | |
1434 | automatically rearranges each row in the same order as the headers you | |
1435 | provided. If you would like to disable this, set I<automap> to 0 | |
1436 | during object creation, and instead rely on the column_map() method to | |
1437 | find out the order in which the headers were found. Furthermore, | |
1438 | TableExtract will automatically compensate for cell span issues so | |
1439 | that columns are really the same columns as you would visually see in | |
1440 | a browser. This behavior can be disabled by setting the I<gridmap> | |
1441 | parameter to 0. HTML is stripped from the entire textual content of a | |
1442 | cell before header matches are attempted -- unless the I<keep_html> | |
1443 | parameter was enabled. | |
1444 | ||
1445 | I<Depth> and I<Count> are more specific ways to specify tables in | |
1446 | relation to one another. I<Depth> represents how deeply a table | |
1447 | resides in other tables. The depth of a top-level table in the | |
1448 | document is 0. A table within a top-level table has a depth of 1, and | |
1449 | so on. Each depth can be thought of as a layer; tables sharing the | |
1450 | same depth are on the same layer. Within each of these layers, | |
1451 | I<Count> represents the order in which a table was seen at that depth, | |
1452 | starting with 0. Providing both a I<depth> and a I<count> will | |
1453 | uniquely specify a table within a document. | |
1454 | ||
1455 | Each of the I<Headers>, I<Depth>, and I<Count> specifications are | |
1456 | cumulative in their effect on the overall extraction. For instance, if | |
1457 | you specify only a I<Depth>, then you get all tables at that depth | |
1458 | (note that these could very well reside in separate higher-level | |
1459 | tables throughout the document since depth extends across tables). If | |
1460 | you specify only a I<Count>, then the tables at that I<Count> from all | |
1461 | depths are returned (i.e., the I<n>th occurrence of a table at each | |
1462 | depth). If you only specify I<Headers>, then you get all tables in the | |
1463 | document containing those column headers. If you have specified | |
1464 | multiple constraints of I<Headers>, I<Depth>, and I<Count>, then each | |
1465 | constraint has veto power over whether a particular table is | |
1466 | extracted. | |
1467 | ||
1468 | If no I<Headers>, I<Depth>, or I<Count> are specified, then all | |
1469 | tables match. | |
1470 | ||
1471 | Text that is gathered from the tables is decoded with HTML::Entities | |
1472 | by default; this can be disabled by setting the I<decode> parameter to | |
1473 | 0. | |
1474 | ||
1475 | =head2 Chains | |
1476 | ||
1477 | Make sure you fully understand the notions of I<depth> and I<count> | |
1478 | before proceeding, because it is about to become a bit more involved. | |
1479 | ||
1480 | Table matches using I<Headers>, I<Depth>, or I<Count> can be chained | |
1481 | together in order to further specify tables relative to one | |
1482 | another. Links in chains are successively applied to tables within | |
1483 | tables. Top level constraints (i.e., I<header>, I<depth>, and I<count> | |
1484 | parameters for the TableExtract object) behave as the first link in | |
1485 | the chain. Additional links are specified using the I<chain> | |
1486 | parameter. Each link in the chain has its own set of constraints. For | |
1487 | example: | |
1488 | ||
1489 | $te = new HTML::TableExtract | |
1490 | ( | |
1491 | headers => [qw(Summary Region)], | |
1492 | chain => [ | |
1493 | { depth => 0, count => 2 }, | |
1494 | { headers => [qw(Part Qty Cost)] } | |
1495 | ], | |
1496 | ); | |
1497 | ||
1498 | The matching process in this case will start with B<all> tables in the | |
1499 | document that have "Summary" and "Region" in their headers. For now, | |
1500 | assume that there was only one table that matched these headers. Each | |
1501 | table contained within that table will be compared to the first link | |
1502 | in the chain. Depth 0 means that a matching table must be immediately | |
1503 | contained within the current table; count 2 means that the matching | |
1504 | table must also be the third at that depth (counts and depths start at | |
1505 | 0). In other words, the next link of the chain will match on the | |
1506 | third table immediately contained within our first matched table. Once | |
1507 | this link matches, then B<all> further tables beneath that table that | |
1508 | have "Part", "Qty", and "Cost" in their headers will match. By | |
1509 | default, it is only tables at the end of the chains that are returned | |
1510 | to the application, so these tables are returned. | |
1511 | ||
1512 | Each time a link in a chain matches a table, an additional context for | |
1513 | I<depth> and I<count> is established. It is perhaps easiest to | |
1514 | visualize a I<context> as a brand-new HTML document, with new depths | |
1515 | and counts to compare to the remaining links in the chain. The top | |
1516 | level HTML document is the first context. Each table in the document | |
1517 | establishes a new context. I<Depth> in a chain link is relative to the | |
1518 | context that the matching table creates (i.e., a link depth of 0 would | |
1519 | be a table immediately contained within the table that matched the | |
1520 | prior link in the chain). Likewise, that same context keeps track of | |
1521 | I<counts> within the new depth scheme for comparison to the remaining | |
1522 | links in the chain. Headers still apply if they are present in a link, | |
1523 | but they are always independent of context. | |
1524 | ||
1525 | As it turns out, specifying a depth and count provides a unique | |
1526 | address for a table within a context. For non-unique constraints, such | |
1527 | as just a depth, or headers, there can be multiple matches for a given | |
1528 | link. In these cases the chain "forks" and attempts to make further | |
1529 | matches within each of these tables. | |
1530 | ||
1531 | By default, chains are I<elastic>. This means that when a particular | |
1532 | link does not match on a table, it is passed down to subtables | |
1533 | unchanged. For example: | |
1534 | ||
1535 | $te = new HTML::TableExtract | |
1536 | ( | |
1537 | headers => [qw(Summary Region)], | |
1538 | chain => [ | |
1539 | { headers => [qw(Part Qty Cost)] } | |
1540 | ], | |
1541 | ); | |
1542 | ||
1543 | If there are intervening tables between the two header queries, they | |
1544 | will be ignored; this query will extract all tables with "Part", | |
1545 | "Qty", and "Cost" in the headers that are contained in any table with | |
1546 | "Summary" and "Region" in its headers, regardless of how embedded the | |
1547 | inner tables are. If you want a chain to be inelastic, you can set the | |
1548 | I<elastic> parameter to 0 for the whole TableExtract object. Using the | |
1549 | same example: | |
1550 | ||
1551 | $te = new HTML::TableExtract | |
1552 | ( | |
1553 | headers => [qw(Summary Region)], | |
1554 | chain => [ | |
1555 | { headers => [qw(Part Qty Cost)] } | |
1556 | ], | |
1557 | elastic => 0, | |
1558 | ); | |
1559 | ||
1560 | In this case, the inner table (Part, Qty, Cost) must be B<immediately> | |
1561 | contained within the outer table (Summary, Region) in order for the | |
1562 | match to take place. This is equivalent to specifying a depth of 0 for | |
1563 | each link in the chain; if you only want particular links to be | |
1564 | inelastic, then simply set their depths to 0. | |
1565 | ||
1566 | By default, only tables that match at the end of the chains are | |
1567 | retained. The intermediate matches along the chain are referred to as | |
1568 | I<waypoints>, and are not extracted by default. A waypoint may be | |
1569 | retained, however, by specifiying the I<keep> parameter in that link | |
1570 | of the chain. This parameter may be specified at the top level as well | |
1571 | if you want to keep tables that match the first set of constraints in | |
1572 | the object. If you want to keep all tables that match along the chain, | |
1573 | the specify the I<keepall> parameter at the top level. | |
1574 | ||
1575 | Are chains overkill? Probably. In reality, nested HTML tables tend not | |
1576 | to be very deep, so there will usually not be much need for lots of | |
1577 | links in a chain. Theoretically, however, chains offer precise | |
1578 | targeting of tables relative to one another, no matter how deeply | |
1579 | nested they are. | |
1580 | ||
1581 | =head2 Pop Quiz | |
1582 | ||
1583 | What happens with the following table extraction? | |
1584 | ||
1585 | $te = new HTML::TableExtract( | |
1586 | chain => [ { depth => 0 } ], | |
1587 | ); | |
1588 | ||
1589 | Answer: All tables that are contained in another table are extracted | |
1590 | from the document. In this case, there were no top-level constraints | |
1591 | specified, which if you recall means that B<all> tables match the | |
1592 | first set of constraints (or non-constraints, in this case!). A depth | |
1593 | of 0 in the next link of the chain means that the matching table must | |
1594 | be immediately contained within the table from a prior match. | |
1595 | ||
1596 | The following is equivalent: | |
1597 | ||
1598 | $te = new HTML::TableExtract( | |
1599 | depth => 1, | |
1600 | subtables => 1, | |
1601 | ) | |
1602 | ||
1603 | The I<subtables> parameter tells TableExtract to scoop up all tables | |
1604 | contained within the matching tables. In conjunction with a depth of | |
1605 | 1, this has the affect of discarding all top-level tables in the | |
1606 | document, which is exactly what occurred in the prior example. | |
1607 | ||
1608 | =head2 Advice | |
1609 | ||
1610 | The main point of this module was to provide a flexible method of | |
1611 | extracting tabular information from HTML documents without relying to | |
1612 | heavily on the document layout. For that reason, I suggest using | |
1613 | I<Headers> whenever possible -- that way, you are anchoring your | |
1614 | extraction on what the document is trying to communicate rather than | |
1615 | some feature of the HTML comprising the document (other than the fact | |
1616 | that the data is contained in a table). | |
1617 | ||
1618 | HTML::TableExtract is a subclass of HTML::Parser, and as such inherits | |
1619 | all of its basic methods. In particular, C<start()>, C<end()>, and | |
1620 | C<text()> are utilized. Feel free to override them, but if you do not | |
1621 | eventually invoke them in the SUPER class with some content, results | |
1622 | are not guaranteed. | |
1623 | ||
1624 | =head1 METHODS | |
1625 | ||
1626 | The following are the top-level methods of the HTML::TableExtract | |
1627 | object. Tables that have matched a query are actually returned as | |
1628 | separate objects of type HTML::TableExtract::TableState. These table | |
1629 | state objects have their own methods, documented further below. There | |
1630 | are some top-level methods that are present for convenience and | |
1631 | backwards compatibility that are nothing more than front-ends for | |
1632 | equivalent table state methods. | |
1633 | ||
1634 | =over | |
1635 | ||
1636 | =head2 Constructor | |
1637 | ||
1638 | =item new() | |
1639 | ||
1640 | Return a new HTML::TableExtract object. Valid attributes are: | |
1641 | ||
1642 | =over | |
1643 | ||
1644 | =item headers | |
1645 | ||
1646 | Passed as an array reference, headers specify strings of interest at | |
1647 | the top of columns within targeted tables. These header strings will | |
1648 | eventually be passed through a non-anchored, case-insensitive regular | |
1649 | expression, so regexp special characters are allowed. The table row | |
1650 | containing the headers is B<not> returned. Columns that are not | |
1651 | beneath one of the provided headers will be ignored. Columns will, by | |
1652 | default, be rearranged into the same order as the headers you provide | |
1653 | (see the I<automap> parameter for more information). Additionally, by | |
1654 | default columns are considered what you would see visually beneath | |
1655 | that header when the table is rendered in a browser. See the | |
1656 | I<gridmap> parameter for more information. HTML within a header is | |
1657 | stripped before the match is attempted, unless the B<keep_html> | |
1658 | parameter was specified. | |
1659 | ||
1660 | =item depth | |
1661 | ||
1662 | Specify how embedded in other tables your tables of interest should | |
1663 | be. Top-level tables in the HTML document have a depth of 0, tables | |
1664 | within top-level tables have a depth of 1, and so on. | |
1665 | ||
1666 | =item count | |
1667 | ||
1668 | Specify which table within each depth you are interested in, beginning | |
1669 | with 0. | |
1670 | ||
1671 | =item chain | |
1672 | ||
1673 | List of additional constraints to be matched sequentially from the top | |
1674 | level constraints. This is a reference to an array of hash | |
1675 | references. Each hash is a link in the chain, and can be specified in | |
1676 | terms of I<depth>, I<count>, and I<headers>. Further modifiers include | |
1677 | I<keep>, which means to retain the table if it would normally be | |
1678 | dropped as a waypoint. | |
1679 | ||
1680 | =item automap | |
1681 | ||
1682 | Automatically applies the ordering reported by column_map() to the | |
1683 | rows returned by rows(). This only makes a difference if you have | |
1684 | specified I<Headers> and they turn out to be in a different order in | |
1685 | the table than what you specified. Automap will rearrange the columns | |
1686 | in the same order as the headers appear. To get the original ordering, | |
1687 | you will need to take another slice of each row using | |
1688 | column_map(). I<automap> is enabled by default. | |
1689 | ||
1690 | =item gridmap | |
1691 | ||
1692 | Controls whether the table contents are returned as a grid or a | |
1693 | tree. ROWSPAN and COLSPAN issues are compensated for, and columns | |
1694 | really are columns. Empty phantom cells are created where they would | |
1695 | have been obscured by ROWSPAN or COLSPAN settings. This really becomes | |
1696 | an issue when extracting columns beneath headers. Enabled by default. | |
1697 | ||
1698 | =item keepall | |
1699 | ||
1700 | Keep all tables that matched along a chain, including tables matched | |
1701 | by top level contraints. By default, waypoints are dropped and only | |
1702 | the matches at the end of the chain are retained. To retain a | |
1703 | particular waypoint along a chain, use the I<keep> parameter in that | |
1704 | link. | |
1705 | ||
1706 | =item elastic | |
1707 | ||
1708 | When set to 0, all links in chains will be treated as though they had | |
1709 | a depth of 0 specified, which means there can be no intervening | |
1710 | unmatched tables between matches on links. | |
1711 | ||
1712 | =item subtables | |
1713 | ||
1714 | Extract all tables within matched tables. | |
1715 | ||
1716 | =item decode | |
1717 | ||
1718 | Automatically decode retrieved text with | |
1719 | HTML::Entities::decode_entities(). Enabled by default. | |
1720 | ||
1721 | =item br_translate | |
1722 | ||
1723 | Translate <br> tags into newlines. Sometimes the remaining text can be | |
1724 | hard to parse if the <br> tag is simply dropped. Enabled by default. | |
1725 | Has no effect if I<keep_html> is enabled. | |
1726 | ||
1727 | =item keep_html | |
1728 | ||
1729 | Return the raw HTML contained in the cell, rather than just the | |
1730 | visible text. Embedded tables are B<not> retained in the HTML | |
1731 | extracted from a cell. Patterns for header matches must take into | |
1732 | account HTML in the string if this option is enabled. | |
1733 | ||
1734 | =item debug | |
1735 | ||
1736 | Prints some debugging information to STDOUT, more for higher values. | |
1737 | ||
1738 | =back | |
1739 | ||
1740 | =head2 Regular Methods | |
1741 | ||
1742 | =item depths() | |
1743 | ||
1744 | Returns all depths that contained matched tables in the document. | |
1745 | ||
1746 | =item counts($depth) | |
1747 | ||
1748 | For a particular depth, returns all counts that contained matched | |
1749 | tables. | |
1750 | ||
1751 | =item table_state($depth, $count) | |
1752 | ||
1753 | For a particular depth and count, return the table state object for | |
1754 | the table found, if any. | |
1755 | ||
1756 | =item table_states() | |
1757 | ||
1758 | Return table state objects for all tables that matched. | |
1759 | ||
1760 | =item first_table_state_found() | |
1761 | ||
1762 | Return the table state object for the first table matched in the | |
1763 | document. | |
1764 | ||
1765 | =head2 TABLE STATE METHODS | |
1766 | ||
1767 | The following methods are invoked from an | |
1768 | HTML::TableExtract::TableState object, such as those returned from the | |
1769 | C<table_states()> method. | |
1770 | ||
1771 | =item rows() | |
1772 | ||
1773 | Return all rows within a matched table. Each row returned is a | |
1774 | reference to an array containing the text of each cell. | |
1775 | ||
1776 | =item depth() | |
1777 | ||
1778 | Return the (absolute) depth at which this table was found. | |
1779 | ||
1780 | =item count() | |
1781 | ||
1782 | Return the count for this table within the depth it was found. | |
1783 | ||
1784 | =item coords() | |
1785 | ||
1786 | Return depth and count in a list. | |
1787 | ||
1788 | =item column_map() | |
1789 | ||
1790 | Return the order (via indices) in which the provided headers were | |
1791 | found. These indices can be used as slices on rows to either order the | |
1792 | rows in the same order as headers or restore the rows to their natural | |
1793 | order, depending on whether the rows have been pre-adjusted using the | |
1794 | I<automap> parameter. | |
1795 | ||
1796 | =item lineage() | |
1797 | ||
1798 | Returns the path of matched tables that led to matching this | |
1799 | table. Lineage only makes sense if chains were used. Tables that were | |
1800 | not matched by a link in the chain are not included in lineage. The | |
1801 | lineage path is a list of array refs containing depth and count values | |
1802 | for each table involved. | |
1803 | ||
1804 | =head2 Procedural Methods | |
1805 | ||
1806 | The following top level methods are alternatives to invoking methods | |
1807 | in a table state object. If you do not want to deal with table state | |
1808 | objects, then these methods are for you. The "tables" they deal in are | |
1809 | actually just arrays of arrays, which happen to be the current | |
1810 | internal data structure of the table state objects. They are here for | |
1811 | backwards compatibility. | |
1812 | ||
1813 | =item table($depth, $count) | |
1814 | ||
1815 | Same as C<table_state()>, but returns the internal data structure | |
1816 | rather than the table state object. | |
1817 | ||
1818 | =item tables() | |
1819 | ||
1820 | Same as C<table_states()>, but returns the data structures rather than | |
1821 | the table state objects. | |
1822 | ||
1823 | =item first_table_found() | |
1824 | ||
1825 | Same as C<first_table_state_found()>, except returns the data | |
1826 | structure for first table that matched. | |
1827 | ||
1828 | =item table_coords($table) | |
1829 | ||
1830 | Returns the depth and count for a particular table data structure. See | |
1831 | the C<coords()> method provided by table state objects. | |
1832 | ||
1833 | =item rows() | |
1834 | ||
1835 | =item rows($table) | |
1836 | ||
1837 | Return a lsit of the rows for a particular table data structure (first | |
1838 | table found by default). See the C<rows()> method provided by table | |
1839 | state objects. | |
1840 | ||
1841 | =item column_map() | |
1842 | ||
1843 | =item column_map($table) | |
1844 | ||
1845 | Return the column map for a particular table data structure (first | |
1846 | found by default). See the C<column_map()> method provided by table | |
1847 | state objects. | |
1848 | ||
1849 | =back | |
1850 | ||
1851 | =head1 REQUIRES | |
1852 | ||
1853 | HTML::Parser(3), HTML::Entities(3) | |
1854 | ||
1855 | =head1 AUTHOR | |
1856 | ||
1857 | Matthew P. Sisk, E<lt>F<sisk@mojotoad.com>E<gt> | |
1858 | ||
1859 | =head1 COPYRIGHT | |
1860 | ||
1861 | Copyright (c) 2000-2002 Matthew P. Sisk. | |
1862 | All rights reserved. All wrongs revenged. This program is free | |
1863 | software; you can redistribute it and/or modify it under the | |
1864 | same terms as Perl itself. | |
1865 | ||
1866 | =head1 SEE ALSO | |
1867 | ||
1868 | HTML::Parser(3), perl(1). | |
1869 | ||
1870 | =cut | |
1871 | ||
1872 | In honor of fragmented markup languages and sugar mining: | |
1873 | ||
1874 | The Good and The Bad | |
1875 | Ted Hawkins (1936-1994) | |
1876 | ||
1877 | Living is good | |
1878 | when you have someone to share it with | |
1879 | Laughter is bad | |
1880 | when there is no one there to share it with | |
1881 | Talking is sad | |
1882 | if you've got no one to talk to | |
1883 | Dying is good | |
1884 | when the one you love grows tired of you | |
1885 | ||
1886 | Sugar is no good | |
1887 | once it's cast among the white sand | |
1888 | What the point | |
1889 | in pulling the gray hairs from among the black strands | |
1890 | When you're old | |
1891 | you shouldn't walk in the fast lane | |
1892 | Oh ain't it useless | |
1893 | to keep trying to draw true love from that man | |
1894 | ||
1895 | He'll hurt you, | |
1896 | Yes just for the sake of hurting you | |
1897 | and he'll hate you | |
1898 | if you try to love him just the same | |
1899 | He'll use you | |
1900 | and everything you have to offer him | |
1901 | On your way girl | |
1902 | Get out and find you someone new |