Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / HTML / TableExtract.pm
CommitLineData
86530b38
AT
1package HTML::TableExtract;
2
3# This package extracts tables from HTML. Tables of interest may be
4# specified using header information, depth, order in a depth, or some
5# combination of the three. See the POD for more information.
6#
7# Author: Matthew P. Sisk. See the POD for copyright information.
8
9use strict;
10use Carp;
11
12use vars qw($VERSION @ISA);
13
14$VERSION = '1.08';
15
16use HTML::Parser;
17@ISA = qw(HTML::Parser);
18
19use HTML::Entities;
20
21my %Defaults = (
22 headers => undef,
23 depth => undef,
24 count => undef,
25 chain => undef,
26 subtables => undef,
27 gridmap => 1,
28 decode => 1,
29 automap => 1,
30 br_translate => 1,
31 head_include => 0,
32 elastic => 1,
33 keep => 0,
34 keepall => 0,
35 debug => 0,
36 keep_html => 0,
37 );
38my $Dpat = join('|', keys %Defaults);
39
40### Constructor
41
42sub new {
43 my $that = shift;
44 my $class = ref($that) || $that;
45
46 my(%pass, %parms, $k, $v);
47 while (($k,$v) = splice(@_, 0, 2)) {
48 if ($k eq 'headers' || $k eq 'chain') {
49 ref $v eq 'ARRAY'
50 or croak "Param '$k' must be passed in ref to array\n";
51 if ($k eq 'chain') {
52 # Filter out non-links (has refs...allows for extra commas, etc)
53 @$v = grep(ref eq 'HASH', @$v);
54 }
55 $parms{$k} = $v;
56 }
57 elsif ($k =~ /^$Dpat$/) {
58 $parms{$k} = $v;
59 }
60 else {
61 $pass{$k} = $v;
62 }
63 }
64
65 my $self = $class->SUPER::new(%pass);
66 bless $self, $class;
67 foreach (keys %parms, keys %Defaults) {
68 $self->{$_} = exists $parms{$_} && defined $parms{$_} ?
69 $parms{$_} : $Defaults{$_};
70 }
71 if ($self->{headers}) {
72 print STDERR "TE here, headers: ", join(',', @{$self->{headers}}),"\n"
73 if $self->{debug};
74 $self->{gridmap} = 1;
75 }
76 # Initialize counts and containers
77 $self->{_cdepth} = -1;
78 $self->{_tablestack} = [];
79 $self->{_tables} = {};
80 $self->{_ts_sequential} = [];
81 $self->{_table_mapback} = {};
82 $self->{_counts} = {};
83
84 $self;
85}
86
87### HTML::Parser overrides
88
89sub start {
90 my $self = shift;
91
92 # Create a new table state if entering a table.
93 if ($_[0] eq 'table') {
94 $self->_enter_table;
95 }
96
97 # Rows and cells are next. We obviously need not bother checking any
98 # tags if we aren't in a table.
99 if ($self->{_in_a_table}) {
100 my $ts = $self->_current_table_state;
101 my $skiptag = 0;
102 if ($_[0] eq 'tr') {
103 $ts->_enter_row;
104 ++$skiptag;
105 }
106 elsif ($_[0] eq 'td' || $_[0] eq 'th') {
107 $ts->_enter_cell;
108 # Inspect rowspan/colspan attributes, record as necessary for
109 # future column count transforms.
110 if ($self->{gridmap}) {
111 my %attrs = ref $_[1] ? %{$_[1]} : {};
112 if (exists $attrs{rowspan} || exists $attrs{colspan}) {
113 $ts->_skew($attrs{rowspan} || 1, $attrs{colspan} || 1);
114 }
115 }
116 ++$skiptag;
117 }
118 if ($self->{keep_html} && !$skiptag) {
119 $self->text($_[3]);
120 }
121 }
122
123 # <br> patch. We like to dispense with HTML, but blindly zapping
124 # <br> will sometimes make the resulting text hard to parse if there
125 # is no newline. Therefore, when enabled, we replace <br> with
126 # newline. Pointed out by Volker Stuerzl <Volker.Stuerzl@gmx.de>
127 if ($_[0] eq 'br' && $self->{br_translate} && !$self->{keep_html}) {
128 $self->text("\n");
129 }
130
131
132} # end start
133
134sub end {
135 my $self = shift;
136 # Don't bother if we're not actually in a table.
137 if ($self->{_in_a_table}) {
138 my $ts = $self->_current_table_state;
139 if ($_[0] eq 'td' || $_[0] eq 'th') {
140 $ts->_exit_cell;
141 }
142 elsif ($_[0] eq 'tr') {
143 $ts->_exit_row;
144 }
145 elsif ($_[0] eq 'table') {
146 $self->_exit_table;
147 }
148 $self->text($_[1]) if $self->{keep_html} && $ts->{in_cell};
149 }
150}
151
152sub text {
153 my $self = shift;
154 # Don't bother unless we are in a table
155 if ($self->{_in_a_table}) {
156 my $ts = $self->_current_table_state;
157 # Don't bother unless we are in a row or cell
158 return unless $ts->{in_cell};
159 if ($ts->_text_hungry) {
160 $ts->_taste_text($self->{decode} ? decode_entities($_[0]) : $_[0]);
161 }
162 }
163}
164
165### End HTML::Parser overrides
166
167### Report Methods
168
169sub depths {
170 # Return all depths where valid tables were located.
171 my $self = shift;
172 return () unless ref $self->{_tables};
173 sort { $a <=> $b } keys %{$self->{_tables}};
174}
175
176sub counts {
177 # Given a depth, return the counts of all valid tables found therein.
178 my($self, $depth) = @_;
179 defined $depth or croak "Depth required\n";
180 sort { $a <=> $b } keys %{$self->{_tables}{$depth}};
181}
182
183sub table {
184 # Return the table content for a particular depth and count
185 shift->table_state(@_)->{content};
186}
187
188sub table_state {
189 # Return the table state for a particular depth and count
190 my($self, $depth, $count) = @_;
191 defined $depth or croak "Depth required\n";
192 defined $count or croak "Count required\n";
193 if (! $self->{_tables}{$depth} || ! $self->{_tables}{$depth}{$count}) {
194 return undef;
195 }
196 $self->{_tables}{$depth}{$count};
197}
198
199sub rows {
200 # Return the rows for a table. First table found if no table
201 # specified.
202 my($self, $table) = @_;
203 my @tc;
204 if (!$table) {
205 $table = $self->first_table_found;
206 }
207 return () unless ref $table;
208 my $ts = $self->{_table_mapback}{$table};
209 $ts->rows;
210}
211
212sub first_table_found {
213 shift->first_table_state_found(@_)->{content};
214}
215
216sub first_table_state_found {
217 my $self = shift;
218 ref $self->{_ts_sequential}[0] ? $self->{_ts_sequential}[0] : {};
219}
220
221sub tables {
222 # Return content of all valid tables found, in the order that
223 # they were seen.
224 map($_->{content}, shift->table_states(@_));
225}
226
227sub table_states {
228 # Return all valid table records found, in the order that
229 # they were seen.
230 my $self = shift;
231 @{$self->{_ts_sequential}};
232}
233
234sub table_coords {
235 # Return the depth and count of a table
236 my($self, $table) = @_;
237 ref $table or croak "Table reference required\n";
238 my $ts = $self->{_table_mapback}{$table};
239 return () unless ref $ts;
240 $ts->coords;
241}
242
243sub column_map {
244 # Return the column numbers of a particular table in the same order
245 # as the provided headers.
246 my($self, $table) = @_;
247 if (! defined $table) {
248 $table = $self->first_table_found;
249 }
250 my $ts = $self->{_table_mapback}{$table};
251 return () unless ref $ts;
252 $ts->column_map;
253}
254
255### Runtime
256
257sub _enter_table {
258 my $self = shift;
259
260 ++$self->{_cdepth};
261 ++$self->{_in_a_table};
262
263 my $depth = $self->{_cdepth};
264
265 # Table states can come and go on the stack...here we retrieve the
266 # table state for the table surrounding the current table tag
267 # (parent table state). If the current table tag belongs to a top
268 # level table, then this will be undef.
269 my $pts = $self->_current_table_state;
270
271 # Counts are tracked for each depth. Depth count hashes are
272 # maintained for each of the table state objects; descendant
273 # tables accumulate a list of these hashes, all of which track
274 # counts relative to the point of view of that table state.
275 my $counts = ref $pts ? $pts->{counts} : [$self->{_counts}];
276 foreach (@{$counts}) {
277 my $c = $_;
278 if (exists $_->{$depth}) {
279 ++$_->{$depth};
280 }
281 else {
282 $_->{$depth} = 0;
283 }
284 }
285 my $count = $self->{_counts}{$depth} || 0;
286
287 print STDERR "TABLE: cdepth $depth, ccount $count, it: $self->{_in_a_table}\n"
288 if $self->{debug} >= 2;
289
290 # Umbrella status means that this current table and all of its
291 # descendant tables will be harvested. This can happen when there
292 # exist target conditions with no headers, depth, or count, or
293 # when a particular table has been selected and the subtables
294 # parameter was initially specified.
295 my $umbrella = 0;
296 if (ref $pts) {
297 # If the subtables parameter was specified and the last table was
298 # being harvested, the upcoming table (and therefore all of it's
299 # descendants) is under an umbrella.
300 ++$umbrella if $self->{subtables} && $pts->_active;
301 }
302 if (! defined $self->{depth} && !defined $self->{count}
303 && !$self->{headers} && !$self->{chain}) {
304 ++$umbrella;
305 }
306
307 # Basic parameters for the soon-to-be-created table state.
308 my %tsparms = (
309 depth => $depth,
310 count => $count,
311 umbrella => $umbrella,
312 automap => $self->{automap},
313 elastic => $self->{elastic},
314 counts => $counts,
315 keep => $self->{keep},
316 keepall => $self->{keepall},
317 debug => $self->{debug},
318 keep_html => $self->{keep_html},
319 );
320
321 # Target constraints. There is no point in passing any of these
322 # along if we are under an umbrella. Notice that with table states,
323 # "depth" and "count" are absolute coordinates recording where this
324 # table was created, whereas "tdepth" and "tcount" are the target
325 # constraints. Headers and chain have no "absolute" meaning,
326 # therefore are passed by the same name.
327 if (!$umbrella) {
328 $tsparms{tdepth} = $self->{depth} if defined $self->{depth};
329 $tsparms{tcount} = $self->{count} if defined $self->{count};
330 foreach (qw(headers chain head_include)) {
331 $tsparms{$_} = $self->{$_} if defined $self->{$_};
332 }
333 }
334
335 # Abracadabra
336 my $ts = HTML::TableExtract::TableState->new(%tsparms);
337
338 # Inherit lineage
339 unshift(@{$ts->{lineage}}, @{$pts->{lineage}}) if ref $pts;
340
341 # Chain evolution from parent table state. Once again, however,
342 # there is no point in passing the chain info along if we are under
343 # an umbrella. These frames are just *potential* matches from the
344 # chain. If no match occurs for a particular frame, then that frame
345 # will simply be passed along to the next generation of table states
346 # unchanged (assuming elastic behavior has not been disabled). Note:
347 # frames based on top level constraints, as opposed to chain
348 # specifications, are formed during TableState instantiation above.
349 $pts->_spawn_frames($ts) if ref $self->{chain} && !$umbrella && ref $pts;
350
351 # Inform the new table state that there will be no more constraints
352 # forthcoming.
353 $ts->_pre_latch;
354
355 # Push the newly created and configured table state onto the
356 # stack. This will now be the _current_table_state().
357 push(@{$self->{_tablestack}}, $ts);
358}
359
360sub _exit_table {
361 my $self = shift;
362 my $ts = $self->_current_table_state;
363
364 # Last ditch fix for HTML mangle
365 $ts->_exit_cell if $ts->{in_cell};
366 $ts->_exit_row if $ts->{in_row};
367
368 if ($ts->_active) {
369 # Retain our newly captured table, assuming we bothered with it.
370 $self->_add_table_state($ts);
371 print STDERR "Captured table ($ts->{depth},$ts->{count})\n"
372 if $self->{debug} >= 2;
373 }
374
375 # Restore last table state
376 pop(@{$self->{_tablestack}});
377 --$self->{_in_a_table};
378 my $lts = $self->_current_table_state;
379 if (ref $lts) {
380 $self->{_cdepth} = $lts->{depth};
381 }
382 else {
383 # Back to the top level
384 $self->{_cdepth} = -1;
385 }
386 print STDERR "LEAVE: cdepth: $self->{_cdepth}, ccount: $ts->{count}, it: $self->{_in_a_table}\n" if $self->{debug} >= 2;
387}
388
389sub _add_table_state {
390 my($self, $ts) = @_;
391 croak "Table state ref required\n" unless ref $ts;
392 # Preliminary init sweep to appease -w
393 #
394 # These undefs would exist for empty <TD> since text() never got
395 # called. Don't want to blindly do this in a start('td') because
396 # headers might have vetoed. Also track max row length in case we
397 # need to pad the other rows in gridmap mode.
398 my $cmax = 0;
399 foreach my $r (@{$ts->{content}}) {
400 $cmax = $#$r if $#$r > $cmax;
401 foreach (0 .. $#$r) {
402 $r->[$_] = '' unless defined $r->[$_];
403 }
404 }
405 # Pad right side of columns if gridmap or header slicing
406 if ($self->{gridmap}) {
407 foreach my $r (@{$ts->{content}}) {
408 grep($r->[$_] = '', $#$r + 1 .. $cmax) if $#$r < $cmax;
409 }
410 }
411
412 $self->{_tables}{$ts->{depth}}{$ts->{count}} = $ts;
413 $self->{_table_mapback}{$ts->{content}} = $ts;
414 push(@{$self->{_ts_sequential}}, $ts);
415}
416
417sub _current_table_state {
418 my $self = shift;
419 $self->{_tablestack}[$#{$self->{_tablestack}}];
420}
421
422##########
423
424{
425
426 package HTML::TableExtract::TableState;
427
428 use strict;
429 use Carp;
430
431 sub new {
432 my $that = shift;
433 my $class = ref($that) || $that;
434 # Note: 'depth' and 'count' are where this table were found.
435 # 'tdepth' and 'tcount' are target constraints on which to trigger.
436 # 'headers' represent a target constraint, location independent.
437 my $self = {
438 umbrella => 0,
439 in_row => 0,
440 in_cell => 0,
441 rc => -1,
442 cc => -1,
443 frames => [],
444 content => [],
445 htxt => '',
446 order => [],
447 counts => [{}],
448 debug => 0,
449 };
450 bless $self, $class;
451
452 my %parms = @_;
453
454 # Depth and Count -- this is the absolute address of the table.
455 croak "Absolute depth required\n" unless defined $parms{depth};
456 croak "Count required\n" unless defined $parms{count};
457
458 # Inherit count contexts
459 if ($parms{counts}) {
460 push(@{$self->{counts}}, @{$parms{counts}});
461 delete $parms{counts};
462 }
463
464 foreach (keys %parms) {
465 $self->{$_} = $parms{$_};
466 }
467
468 # Register lineage
469 $self->{lineage} = [ "$self->{depth},$self->{count}" ];
470
471 # Umbrella is a short circuit. This table and all descendants will
472 # be harvested if the umbrella parameter was asserted. If it was
473 # not, then the initial conditions specified for the new table
474 # state are passed along as the first frame in the chain.
475 if (!$self->{umbrella}) {
476 # Frames are designed to be used when chains are specified. With
477 # no chains specified, there is only a single frame, the global
478 # frame, so frames become a bit redundant. We use the mechanisms
479 # anyway for consistency in the extraction engine. Each frame
480 # contains information that might be relative to a chain
481 # frame. Currently this means depth, count, and headers.
482 my %frame;
483 # Frame depth and count represent target depth and count, in
484 # absolute terms. If present, our initial frame takes these from
485 # the target values in the table state. Unlike frames generated
486 # by chains, the counts hash for the initial frame comes from
487 # the global level (this is necessary since the top-level HTML
488 # document has no table state from which to inherit!). Counts
489 # relative to this frame will be assigned and updated based on
490 # chain links, assuming there are any.
491 $frame{depth} = $self->{tdepth} if exists $self->{tdepth};
492 $frame{count} = $self->{tcount} if exists $self->{tcount};
493 $frame{headers} = $self->{headers} if exists $self->{headers};
494 $frame{counts} = $self->{counts}[$#{$self->{counts}}];
495 $frame{global} = 1;
496 $frame{terminus} = 1 if $self->{keep};
497 $frame{heritage} = "($self->{depth},$self->{count})";
498 $self->_add_frame(\%frame);
499 }
500 else {
501 # Short circuit since we are an umbrella. Activate table state.
502 $self->{active} = 1;
503 }
504 $self;
505 }
506
507 sub _text_hungry {
508 # Text hungry only means that we are interested in gathering the
509 # text, whether it be for header scanning or harvesting.
510 my $self = shift;
511 return 1 if $self->{umbrella};
512 return 0 if $self->{prune};
513 $self->_any_dctrigger;
514 }
515
516 sub _taste_text {
517 # Gather the provided text, either for header scanning or
518 # harvesting.
519 my($self, $text) = @_;
520
521 # Calculate and track skew, regardless of whether we actually want
522 # this column or not.
523 my $sc = $self->_skew;
524
525 # Harvest if trigger conditions have been met in a terminus
526 # frame. If headers have been found, and we are not beneath a
527 # header column, then ignore this text.
528 if ($self->_terminus_trigger && $self->_column_wanted ||
529 $self->{umbrella}) {
530 if (defined $text) { # -w appeasement
531 print STDERR "Add text '$text'\n" if $self->{debug} > 3;
532 $self->_add_text($text, $sc);
533 }
534 }
535 # Regardless of whether or not we are harvesting, we still try to
536 # scan for headers in waypoint frames.
537 if (defined $text && $self->_any_headers && !$self->_any_htrigger) {
538 $self->_htxt($text);
539 }
540 1;
541 }
542
543 ### Init
544
545 sub _pre_latch {
546 # This should be called at some point soon after object creation
547 # to inform the table state that there will be no more constraints
548 # added. This way latches can be pre-set if possible for
549 # efficiency.
550 my $self = shift;
551
552 $self->_trigger_frames;
553 return 0 if $self->{prune};
554
555 if ($self->{umbrella}) {
556 ++$self->{dc_trigger};
557 ++$self->{head_trigger};
558 ++$self->{trigger};
559 ++$self->{active};
560 return;
561 }
562 # The following latches are detectable immediately for a
563 # particular table state.
564 $self->_terminus_dctrigger;
565 $self->_any_dctrigger;
566 $self->_terminus_headers;
567 $self->_any_headers;
568
569 }
570
571 ### Latch methods...'terminus' vs 'any' is an important distinction,
572 ### because conditions might only be satisifed for a waypoint
573 ### frame. In this case, the next frame in the chain will be
574 ### created, but the table itself will not be captured.
575
576 sub _terminus_dctrigger {
577 my $self = shift;
578 return $self->{terminus_dctrigger} if defined $self->{terminus_dctrigger};
579 $self->{terminus_dctrigger} = $self->_check_dctrigger($self->_terminus_frames);
580 }
581
582 sub _any_dctrigger {
583 my $self = shift;
584 return $self->{any_dctrigger} if defined $self->{any_dctrigger};
585 $self->{any_dctrigger} = $self->_check_dctrigger(@{$self->{frames}});
586 }
587
588 sub _terminus_headers {
589 my $self = shift;
590 return $self->{terminus_headers} if defined $self->{terminus_headers};
591 $self->{terminus_headers} = $self->_check_headers($self->_terminus_frames);
592 }
593
594 sub _any_headers {
595 my $self = shift;
596 return $self->{any_headers} if defined $self->{any_headers};
597 $self->{any_headers} = $self->_check_headers(@{$self->{frames}});
598 }
599
600 sub _terminus_htrigger {
601 # Unlike depth and count, this trigger should only latch on
602 # positive values since each row is to be examined.
603 my $self = shift;
604 return $self->{terminus_htrigger} if $self->{terminus_htrigger};
605 $self->{terminus_htrigger} = $self->_check_htrigger($self->_terminus_frames);
606 }
607
608 sub _any_htrigger {
609 my $self = shift;
610 return $self->{any_htrigger} if defined $self->{any_htrigger};
611 $self->{any_htrigger} = $self->_check_htrigger(@{$self->{frames}});
612 }
613
614 sub _terminus_trigger {
615 # This has to be the same frame reporting on dc/header
616 # success. First found is the hero.
617 my $self = shift;
618 return $self->{terminus_trigger} if $self->{terminus_trigger};
619 $self->{terminus_trigger} = $self->_check_trigger($self->_terminus_frames);
620 }
621
622 sub _any_trigger {
623 # This has to be the same frame reporting on dc/header
624 # success. First found is the hero.
625 my $self = shift;
626 return $self->{any_trigger} if $self->{any_trigger};
627 $self->{any_trigger} = $self->_check_trigger(@{$self->{frames}});
628 }
629
630 ### Latch engines
631
632 sub _check_dctrigger {
633 my($self, @frames) = @_;
634 return @frames if $self->{umbrella};
635 my @dctriggered;
636 foreach my $f (@frames) {
637 my $dc_hit = 1;
638 if ($f->{null}) {
639 # Special case...
640 $dc_hit = 0;
641 }
642 else {
643 if (defined $f->{depth} && $f->{depth} != $self->{depth}) {
644 $dc_hit = 0;
645 }
646 elsif (defined $f->{count}) {
647 $dc_hit = 0;
648 if (exists $f->{counts}{$self->{depth}} &&
649 $f->{count} == $f->{counts}{$self->{depth}}) {
650 # Note: frame counts, though relative to chain genesis
651 # depth, are recorded in terms of absolute depths. A
652 # particular counts hash is shared among frames descended
653 # from the same chain instance.
654 $dc_hit = 1;
655 }
656 }
657 }
658 push(@dctriggered, $f) if $dc_hit;
659 }
660 return @dctriggered ? \@dctriggered : undef;
661 }
662
663 sub _check_htrigger {
664 my($self, @frames) = @_;
665 my @htriggered;
666 foreach my $f (@frames) {
667 if ($f->{headers}) {
668 push(@htriggered, $f) if $f->{head_found};
669 }
670 else {
671 push(@htriggered, $f);
672 }
673 }
674 @htriggered ? \@htriggered : undef;
675 }
676
677 sub _check_trigger {
678 # This has to be the same frame reporting on dc/header
679 # success. First found is the hero.
680 my($self, @frames) = @_;
681 return () unless @frames;
682 my $tdct = $self->_check_dctrigger(@frames);
683 my $tht = $self->_check_htrigger(@frames);
684 my %tframes;
685 my %tdc_frames;
686 foreach (ref $tdct ? @$tdct : ()) {
687 $tdc_frames{$_} = $_;
688 $tframes{$_} = $_ unless $tframes{$_};
689 }
690 my %th_frames;
691 foreach (ref $tht ? @$tht : ()) {
692 $th_frames{$_} = $_;
693 $tframes{$_} = $_ unless $tframes{$_};
694 }
695 my @frame_order = grep($tframes{$_}, @frames);
696 my @triggered;
697 foreach (@frame_order) {
698 if ($tdc_frames{$_} && $th_frames{$_}) {
699 push(@triggered, $tframes{$_});
700 }
701 }
702 @triggered ? \@triggered : undef;
703 }
704
705 sub _check_headers {
706 my($self, @frames) = @_;
707 foreach my $f (@frames) {
708 return 1 if $f->{headers};
709 }
710 0;
711 }
712
713 ###
714
715 sub _terminus_frames {
716 # Return all frames that are at the end of a chain, or specified
717 # as a terminus.
718 my $self = shift;
719 my @res;
720 foreach (@{$self->{frames}}) {
721 push(@res, $_) if $_->{terminus};
722 }
723 @res;
724 }
725
726 ###
727
728 sub _trigger_frames {
729 # Trigger each frame whose conditions have been met (i.e., rather
730 # than merely detect conditions, set state in the affected frame
731 # as well).
732 my $self = shift;
733 if (!@{$self->{frames}}) {
734 ++$self->{prune};
735 return 0;
736 }
737 my $t = 0;
738 foreach my $f (@{$self->{frames}}) {
739 if ($f->{triggered}) {
740 ++$t;
741 next;
742 }
743 if ($self->_check_trigger($f)) {
744 ++$t;
745 $f->{triggered} = 1;
746 }
747 }
748 $t;
749 }
750
751 ### Maintain table context
752
753 sub _enter_row {
754 my $self = shift;
755 $self->_exit_cell if $self->{in_cell};
756 $self->_exit_row if $self->{in_row};
757 ++$self->{rc};
758 ++$self->{in_row};
759
760 # Reset next_col for gridmapping
761 $self->{next_col} = 0;
762 while ($self->{taken}{"$self->{rc},$self->{next_col}"}) {
763 ++$self->{next_col};
764 }
765
766 ++$self->{active} if $self->_terminus_trigger;
767 if ($self->{active}) {
768 # Add the new row, unless we're using headers and are still in
769 # the header row
770 push(@{$self->{content}}, [])
771 unless $self->_terminus_headers && $self->_still_in_header_row;
772 }
773 $self->_evolve_frames if $self->_trigger_frames;
774 }
775
776 sub _exit_row {
777 my $self = shift;
778 if ($self->{in_row}) {
779 $self->_exit_cell if $self->{in_cell};
780 $self->{in_row} = 0;
781 $self->{cc} = -1;
782 $self->_reset_header_scanners;
783 if ($self->_terminus_headers && $self->_still_in_header_row) {
784 ++$self->{hslurp};
785 # Store header row number so that we can adjust later (we keep
786 # it around for now in case of skew situations, which are in
787 # absolute row terms)
788 $self->{hrow} = $self->{rc};
789 }
790 }
791 else {
792 print STDERR "Mangled HTML in table ($self->{depth},$self->{count}), extraneous </TR> ignored after row $self->{rc}\n"
793 if $self->{debug};
794 }
795 }
796
797 sub _enter_cell {
798 my $self = shift;
799 $self->_exit_cell if $self->{in_cell};
800 if (!$self->{in_row}) {
801 # Go ahead and try to recover from mangled HTML, because we
802 # care.
803 print STDERR "Mangled HTML in table ($self->{depth},$self->{count}), inferring <TR> as row $self->{rc}\n"
804 if $self->{debug};
805 $self->_enter_row;
806 }
807 ++$self->{cc};
808 ++$self->{in_cell};
809 }
810
811 sub _exit_cell {
812 my $self = shift;
813 if ($self->{in_cell}) {
814 # Trigger taste_text just in case this was an empty cell.
815 $self->_taste_text(undef) if $self->_text_hungry;
816 $self->{in_cell} = 0;
817 $self->_hmatch;
818 }
819 else {
820 print STDERR "Mangled HTML in table ($self->{depth},$self->{count}), extraneous </TD> ignored in row $self->{rc}\n"
821 if $self->{debug};
822 }
823 }
824
825 ###
826
827 sub _add_frame {
828 # Add new frames to this table state.
829 my($self, @frames) = @_;
830 return 1 if $self->{umbrella};
831 foreach my $f (@frames) {
832 ref $f or croak "Hash ref required\n";
833
834 if (! exists $f->{depth} && ! exists $f->{count} && ! $f->{headers}) {
835 # Special case. If there were no constraints, then umbrella
836 # gets set. Otherwise, with chains, we want all nodes to
837 # trigger but not become active due to the potential chain
838 # constraint. This is just a heads up.
839 ++$f->{null};
840 }
841
842 # Take the opportunity to prune frames that are out of their
843 # depth. Keep in mind, depths are specified in absolute terms
844 # for frames, as opposed to relative terms in chains.
845 if (defined $f->{depth} && $f->{depth} < $self->{depth}) {
846 print STDERR "Pruning frame for depth $f->{depth} at depth $self->{depth}\n" if $self->{debug} > 2;
847 next;
848 }
849
850 # If we are an intermediary in a chain, we will never trigger a
851 # harvest (well, unless 'keep' was specified, anyway). Avoid
852 # autovivifying here, because $self->{chain} is used as a test
853 # many times.
854 if (ref $self->{chain}) {
855 if (defined $f->{chaindex} && $f->{chaindex} == $#{$self->{chain}}) {
856 ++$f->{terminus};
857 }
858 }
859 elsif ($f->{global}) {
860 # If there is no chain, the global frame is a terminus.
861 ++$f->{terminus};
862 }
863
864 # Scoop all triggers if keepall has been asserted.
865 if ($self->{keepall}) {
866 ++$f->{terminus};
867 }
868
869 # Set up header pattern if we have headers.
870 if ($f->{headers}) {
871 my $hstring = '(' . join('|', map("($_)", @{$f->{headers}})) . ')';
872 print STDERR "HPAT: /$hstring/\n" if $self->{debug} >= 2;
873 $f->{hpat} = $hstring;
874 $self->_reset_hits($f);
875 }
876
877 if ($self->{debug} > 3) {
878 print STDERR "Adding frame ($f):\n {\n";
879 foreach (sort keys %$f) {
880 next unless defined $f->{$_}; # appease -w
881 print STDERR " $_ => $f->{$_}\n";
882 }
883 print STDERR " }\n";
884 }
885
886 push(@{$self->{frames}}, $f);
887 }
888 # Activate header state if there were any header conditions in the
889 # frames.
890 $self->_scan_state('headers');
891 # Arbitrary true return value.
892 scalar @{$self->{frames}};
893 }
894
895 # Header stuff
896
897 sub _htxt {
898 # Accumulate or reset header text. This is shared across all
899 # frames.
900 my $self = shift;
901 if (@_) {
902 if (defined $_[0]) {
903 $self->{htxt} .= $_[0] if $_[0] !~ /^\s*$/;
904 }
905 else {
906 $self->{htxt} = '';
907 }
908 }
909 $self->{htxt};
910 }
911
912 sub _hmatch {
913 # Given the current htxt, test all frames for matches. This *will*
914 # set state in the frames in the event of a match.
915 my $self = shift;
916 my @hits;
917 return 0 unless $self->_any_headers;
918 foreach my $f (@{$self->{frames}}) {
919 next unless $f->{hpat};
920 if ($self->{htxt} =~ /$f->{hpat}/im) {
921 my $hit = $1;
922 print STDERR "HIT on '$hit' in $self->{htxt} ($self->{rc},$self->{cc})\n" if $self->{debug} >= 4;
923 ++$f->{scanning};
924 # Get rid of the header segment that matched so we can tell
925 # when we're through with all header patterns.
926 foreach (keys %{$f->{hits_left}}) {
927 if ($hit =~ /$_/im) {
928 delete $f->{hits_left}{$_};
929 $hit = $_;
930 last;
931 }
932 }
933 push(@hits, $hit);
934 #
935 my $cc = $self->_skew;
936 $f->{hits}{$cc} = $hit;
937 push(@{$f->{order}}, $cc);
938 if (!%{$f->{hits_left}}) {
939 # We have found all headers for this frame, but we won't
940 # start slurping until this row has ended
941 ++$f->{head_found};
942 $f->{scanning} = undef;
943 }
944 }
945 }
946 # Propogate relevant frame states to overall table state.
947 foreach (qw(head_found scanning)) {
948 $self->_scan_state($_);
949 }
950 # Reset htxt buffer
951 $self->_htxt(undef);
952
953 wantarray ? @hits : scalar @hits;
954 }
955
956 # Header and header state booleans
957
958 sub _scan_state {
959 # This just sets analagous flags on a table state basis
960 # rather than a frame basis, for testing efficiency to
961 # reduce the number of method calls involved.
962 my($self, $state) = @_;
963 foreach (@{$self->{frames}}) {
964 ++$self->{$state} if $_->{state};
965 }
966 $self->{$state};
967 }
968
969 sub _headers { shift->_check_state('headers' ) }
970 sub _head_found { shift->_check_state('head_found') }
971 sub _scanning { shift->_check_state('scanning') }
972
973 # End header stuff
974
975 sub _check_state {
976 my($self, $state) = @_;
977 defined $state or croak "State name required\n";
978 my @frames_with_state;
979 foreach my $f (@{$self->{frames}}) {
980 push(@frames_with_state, $f) if $f->{$state};
981 }
982 return () unless @frames_with_state;
983 wantarray ? @frames_with_state : $frames_with_state[0];
984 }
985
986 # Misc
987
988 sub _evolve_frames {
989 # Retire frames that were triggered; integrate the next link in
990 # the chain if available. If it was the global frame, or the frame
991 # generated from the last in the chain sequence, then activate the
992 # frame and start a new chain.
993 my $self = shift;
994 return if $self->{evolved};
995 $self->{newframes} = [] unless $self->{newframes};
996 foreach my $f (@{$self->{frames}}) {
997 # We're only interested in newly triggered frames.
998 next if !$f->{triggered} || $f->{retired};
999 my %new;
1000 if ($self->{chain}) {
1001 if ($f->{global}) {
1002 # We are the global frame, and we have a chain. Spawn a new
1003 # chain.
1004 $new{chaindex} = 0;
1005 # Chain counts are always relative to the table state in
1006 # which frame genisis occurred. Table states inherit the
1007 # count contexts of parent table states, so that they can be
1008 # updated (and therefore descendant frames get updated as
1009 # well). Count contexts are represented as hashes with
1010 # depths as keys. This frame-specific hash is shared amongst
1011 # all frames descended from chains started in this table
1012 # state.
1013 $new{heritage} = "($self->{depth},$self->{count})";
1014 }
1015 elsif (defined $f->{chaindex}) {
1016 # Generate a new frame based on the next link in the chain
1017 # (unless we are the global frame, in which case we initialize
1018 # a new chain since there is no chain link for the global
1019 # frame).
1020 $new{chaindex} = $f->{chaindex} + 1;
1021 # Relative counts always are inherited from chain genesis. We
1022 # pass by reference so siblings can all update the depth
1023 # counts for that chain.
1024 $new{heritage} = $f->{heritage};
1025 }
1026 }
1027
1028 if ($f->{terminus}) {
1029 # This is a hit since we matched either in the global frame,
1030 # the last link of the chain, or in a link specified as a
1031 # keeper.
1032 ++$f->{active} unless $f->{null};
1033 # If there is a chain, start a new one from this match if it
1034 # was the global frame (if we ever decided to have chains
1035 # spawn chains, this would be the place to do it. Currently
1036 # only the global frame spawns chains).
1037
1038 }
1039
1040 # Since we triggered, one way or the other this frame is retired.
1041 ++$f->{retired};
1042
1043 # Frames always inherit the count context of the table state in
1044 # which they were created.
1045 $new{counts} = $self->{counts}[0];
1046
1047 if (defined $new{chaindex}) {
1048 my $link = $self->{chain}[$new{chaindex}];
1049 # Tables immediately below the current table state are
1050 # considered depth 0 as specified in chains...hence actual
1051 # depth plus one forms the basis for depth 0 in relative
1052 # terms.
1053 $new{depth} = ($self->{depth} + 1) + $link->{depth}
1054 if exists $link->{depth};
1055 $new{count} = $link->{count} if exists $link->{count};
1056 $new{headers} = $link->{headers} if exists $link->{headers};
1057 ++$new{terminus} if $link->{keep};
1058 if ($self->{debug} > 3) {
1059 print STDERR "New proto frame (in ts $self->{depth},$self->{count}) for chain rule $new{chaindex}\n";
1060 print STDERR " {\n";
1061 foreach (sort keys %new) {
1062 print STDERR " $_ => $new{$_}";
1063 if ($_ eq 'counts') {
1064 print STDERR " ",join(' ', map("$_,$new{counts}{$_}",
1065 sort { $a <=> $b } keys %{$new{counts}}));
1066 }
1067 print STDERR "\n";
1068 }
1069 print STDERR " }\n";
1070 }
1071 push(@{$self->{newframes}}, \%new);
1072 }
1073
1074 }
1075 # See if we're done evolving our frames.
1076 foreach my $f (@{$self->{frames}}) {
1077 return 0 unless $f->{retired};
1078 }
1079 # If we are, then flag the whole table state as evolved.
1080 ++$self->{evolved};
1081 }
1082
1083 sub _spawn_frames {
1084 # Build and pass new frames to a child table state. This involves
1085 # retiring old frames and passing along untriggered and new
1086 # frames.
1087 my($self, $child) = @_;
1088 ref $child or croak "Child table state required\n";
1089 if ($self->{umbrella}) {
1090 # Don't mess with frames, just pass the umbrella.
1091 ++$child->{umbrella};
1092 return;
1093 }
1094
1095 my @frames;
1096 my @fields = qw(chaindex depth count headers counts heritage terminus);
1097
1098 foreach my $f (@{$self->{frames}}) {
1099 # Not interested in retired frames (which just matched), root
1100 # frames (which get regenerated each time a frame is created),
1101 # or in unmatched frames when not in elastic mode.
1102 next if !$self->{elastic} || $f->{retired};
1103 my %new;
1104 foreach (grep(exists $f->{$_}, @fields)) {
1105 $new{$_} = $f->{$_};
1106 }
1107 push(@frames, \%new);
1108 }
1109
1110 # Always interested in newly created frames. Make sure and pass
1111 # copies, though, so that siblings don't update each others frame
1112 # sets.
1113 foreach my $f (@{$self->{newframes}}) {
1114 my %new;
1115 foreach (grep(exists $f->{$_}, @fields)) {
1116 $new{$_} = $f->{$_};
1117 }
1118 push(@frames, \%new);
1119 }
1120
1121 $child->_add_frame(@frames) if @frames;
1122 }
1123
1124 # Report methods
1125
1126 sub depth { shift->{depth} }
1127 sub count { shift->{count} }
1128 sub coords {
1129 my $self = shift;
1130 ($self->depth, $self->count);
1131 }
1132
1133 sub lineage {
1134 my $self = shift;
1135 map([split(',', $_)], @{$self->{lineage}});
1136 }
1137
1138 sub rows {
1139 my $self = shift;
1140 if ($self->{automap} && $self->_map_makes_a_difference) {
1141 my @tc;
1142 my @cm = $self->column_map;
1143 foreach (@{$self->{content}}) {
1144 my $r = [@{$_}[@cm]];
1145 # since there could have been non-existent <TD> we need
1146 # to double check initilization to appease -w
1147 foreach (0 .. $#$r) {
1148 $r->[$_] = '' unless defined $r->[$_];
1149 }
1150 push(@tc, $r);
1151 }
1152 return @tc;
1153 }
1154 # No remapping
1155 @{$self->{content}};
1156 }
1157
1158 sub column_map {
1159 # Return the column numbers of this table in the same order as the
1160 # provided headers.
1161 my $self = shift;
1162 my $tframes = $self->_terminus_trigger;
1163 my $tframe = ref $tframes ? $tframes->[0] : undef;
1164 if ($tframe && $tframe->{headers}) {
1165 # First we order the original column counts by taking a hash
1166 # slice based on the original header order. The resulting
1167 # original column numbers are mapped to the actual content
1168 # indicies since we could have a sparse slice.
1169 my %order;
1170 foreach (keys %{$tframe->{hits}}) {
1171 $order{$tframe->{hits}{$_}} = $_;
1172 }
1173 return @order{@{$tframe->{headers}}};
1174 }
1175 else {
1176 return 0 .. $#{$self->{content}[0]};
1177 }
1178 }
1179
1180 sub _map_makes_a_difference {
1181 my $self = shift;
1182 my $diff = 0;
1183 my @order = $self->column_map;
1184 my @sorder = sort { $a <=> $b } @order;
1185 ++$diff if $#order != $#sorder;
1186 ++$diff if $#sorder != $#{$self->{content}[0]};
1187 foreach (0 .. $#order) {
1188 if ($order[$_] != $sorder[$_]) {
1189 ++$diff;
1190 last;
1191 }
1192 }
1193 $diff;
1194 }
1195
1196 sub _add_text {
1197 my($self, $txt, $skew_column) = @_;
1198 # We don't check for $txt being defined, sometimes we want to
1199 # merely insert a placeholder in the content.
1200 my $row = $self->{content}[$#{$self->{content}}];
1201 if (! defined $row->[$skew_column]) {
1202 # Init to appease -w
1203 $row->[$skew_column] = '';
1204 }
1205 return unless defined $txt;
1206 $row->[$skew_column] .= $txt;
1207 $txt;
1208 }
1209
1210 sub _skew {
1211 # Skew registers the effects of rowspan/colspan issues when
1212 # gridmap is enabled.
1213
1214 my($self, $rspan, $cspan) = @_;
1215 my($r,$c) = ($self->{rc},$self->{cc});
1216
1217 if ($self->{debug} > 5) {
1218 print STDERR "($self->{rc},$self->{cc}) Inspecting skew for ($r,$c)";
1219 print STDERR defined $rspan ? " (set with $rspan,$cspan)\n" : "\n";
1220 }
1221
1222 my $sc = $c;
1223 if (! defined $self->{skew_cache}{"$r,$c"}) {
1224 $sc = $self->{next_col} if defined $self->{next_col};
1225 $self->{skew_cache}{"$r,$c"} = $sc;
1226 my $next_col = $sc + 1;
1227 while ($self->{taken}{"$r,$next_col"}) {
1228 ++$next_col;
1229 }
1230 $self->{next_col} = $next_col;
1231 }
1232 else {
1233 $sc = $self->{skew_cache}{"$r,$c"};
1234 }
1235
1236 # If we have span arguments, set skews
1237 if (defined $rspan) {
1238 # Default span is always 1, even if not explicitly stated.
1239 $rspan = 1 unless $rspan;
1240 $cspan = 1 unless $cspan;
1241 --$rspan;
1242 --$cspan;
1243 # 1,1 is a degenerate case, there's nothing to do.
1244 if ($rspan || $cspan) {
1245 foreach my $rs (0 .. $rspan) {
1246 my $cr = $r + $rs;
1247 # If we in the same row as the skewer, the "span" is one less
1248 # because the skewer cell occupies the same row.
1249 my $start_col = $rs ? $sc : $sc + 1;
1250 my $fin_col = $sc + $cspan;
1251 foreach ($start_col .. $fin_col) {
1252 $self->{taken}{"$cr,$_"} = "$r,$sc" unless $self->{taken}{"$cr,$_"};
1253 }
1254 if (!$rs) {
1255 my $next_col = $fin_col + 1;
1256 while ($self->{taken}{"$cr,$next_col"}) {
1257 ++$next_col;
1258 }
1259 $self->{next_col} = $next_col;
1260 }
1261 }
1262 }
1263 }
1264
1265 # Grid column number
1266 $sc;
1267 }
1268
1269 sub _reset_header_scanners {
1270 # When a row ends, this should be called in order to reset frames
1271 # who are in the midst of header scans.
1272 my $self = shift;
1273 my @scanners;
1274 foreach my $f (@{$self->{frames}}) {
1275 next unless $f->{headers} && $f->{scanning};
1276 if ($self->{debug}) {
1277 my $str = "Incomplete header match in row $self->{rc}, resetting scan";
1278 $str .= " link $f->{chaindex}" if defined $f->{chaindex};
1279 $str .= "\n";
1280 print STDERR $str;
1281 }
1282 push(@scanners, $f);
1283 }
1284 $self->_reset_hits(@scanners) if @scanners;
1285 }
1286
1287 sub _header_quest {
1288 # Loosely translated: "Should I even bother scanning for header
1289 # matches?"
1290 my $self = shift;
1291 return 0 unless $self->_any_headers && !$self->_head_found;
1292 foreach my $f (@{$self->{frames}}) {
1293 return 1 if $f->{headers} && $f->{dc_trigger};
1294 }
1295 0;
1296 }
1297
1298 sub _still_in_header_row {
1299 my $self = shift;
1300 return 0 unless $self->_terminus_headers;
1301 !$self->{hslurp} && $self->_terminus_htrigger;
1302 }
1303
1304 # Non waypoint answers
1305
1306 sub _active {
1307 my $self = shift;
1308 return 1 if $self->{active};
1309 my @active;
1310 foreach my $f (@{$self->{frames}}) {
1311 push(@active, $f) if $f->{active};
1312 }
1313 return () unless @active;
1314 ++$self->{active} if @active;
1315 wantarray ? @active : $active[0];
1316 }
1317
1318 sub _column_wanted {
1319 my $self = shift;
1320 my $tframes = $self->_terminus_trigger;
1321 my $tframe = ref $tframes ? $tframes->[0] : undef;
1322 return 0 unless $tframe;
1323 my $wanted = 1;
1324 if ($self->_terminus_headers && $self->{hslurp}) {
1325 # If we are using headers, veto the grab unless we are in an
1326 # applicable column beneath one of the headers.
1327 $wanted = 0
1328 unless exists $tframe->{hits}{$self->_skew};
1329 }
1330 print STDERR "Want ($self->{rc},$self->{cc}): $wanted\n"
1331 if $self->{debug} > 7;
1332 $wanted;
1333 }
1334
1335 sub _reset_hits {
1336 # Reset hits in provided frames. WARNING!!! If you do not provide
1337 # frames, all frames will be reset!
1338 my($self, @frames) = @_;
1339 foreach my $frame (@frames ? @frames : @{$self->{frames}}) {
1340 next unless $frame->{headers};
1341 $frame->{hits} = {};
1342 $frame->{order} = [];
1343 $frame->{scanning} = undef;
1344 foreach (@{$frame->{headers}}) {
1345 ++$frame->{hits_left}{$_};
1346 }
1347 }
1348 1;
1349 }
1350
1351}
1352
13531;
1354
1355__END__
1356
1357=head1 NAME
1358
1359HTML::TableExtract - Perl extension for extracting the text contained in tables within an HTML document.
1360
1361=head1 SYNOPSIS
1362
1363 # Matched tables are returned as "table state" objects; tables can be
1364 # matched using column headers, depth, count within a depth, or some
1365 # combination of the three.
1366
1367 # Using column header information. Assume an HTML document with
1368 # tables that have "Date", "Price", and "Cost" somewhere in a
1369 # row. The columns beneath those headings are what you want to
1370 # extract. They will be returned in the same order as you specified
1371 # the headers since 'automap' is enabled by default.
1372
1373 use HTML::TableExtract;
1374 $te = new HTML::TableExtract( headers => [qw(Date Price Cost)] );
1375 $te->parse($html_string);
1376
1377 # Examine all matching tables
1378 foreach $ts ($te->table_states) {
1379 print "Table (", join(',', $ts->coords), "):\n";
1380 foreach $row ($ts->rows) {
1381 print join(',', @$row), "\n";
1382 }
1383 }
1384
1385 # Old style, using top level methods rather than table state objects.
1386 foreach $table ($te->tables) {
1387 print "Table (", join(',', $te->table_coords($table)), "):\n";
1388 foreach $row ($te->rows($table)) {
1389 print join(',', @$row), "\n";
1390 }
1391 }
1392
1393 # Shorthand...top level rows() method assumes the first table found
1394 # in the document if no arguments are supplied.
1395 foreach $row ($te->rows) {
1396 print join(',', @$row), "\n";
1397 }
1398
1399 # Using depth and count information. Every table in the document has
1400 # a unique depth and count tuple, so when both are specified it is a
1401 # unique table. Depth and count both begin with 0, so in this case we
1402 # are looking for a table (depth 2) within a table (depth 1) within a
1403 # table (depth 0, which is the top level HTML document). In addition,
1404 # it must be the third (count 2) such instance of a table at that
1405 # depth.
1406
1407 $te = new HTML::TableExtract( depth => 2, count => 2 );
1408 $te->parse($html_string);
1409 foreach $ts ($te->table_states) {
1410 print "Table found at ", join(',', $ts->coords), ":\n";
1411 foreach $row ($ts->rows) {
1412 print " ", join(',', @$row), "\n";
1413 }
1414 }
1415
1416=head1 DESCRIPTION
1417
1418HTML::TableExtract is a subclass of HTML::Parser that serves to
1419extract the textual information from tables of interest contained
1420within an HTML document. The text from each extracted table is stored
1421in tabe state objects which hold the information as an array of arrays
1422that represent the rows and cells of that table.
1423
1424There are three constraints available to specify which tables you
1425would like to extract from a document: I<Headers>, I<Depth>, and
1426I<Count>.
1427
1428I<Headers>, the most flexible and adaptive of the techniques, involves
1429specifying text in an array that you expect to appear above the data
1430in the tables of interest. Once all headers have been located in a row
1431of that table, all further cells beneath the columns that matched your
1432headers are extracted. All other columns are ignored: think of it as
1433vertical slices through a table. In addition, TableExtract
1434automatically rearranges each row in the same order as the headers you
1435provided. If you would like to disable this, set I<automap> to 0
1436during object creation, and instead rely on the column_map() method to
1437find out the order in which the headers were found. Furthermore,
1438TableExtract will automatically compensate for cell span issues so
1439that columns are really the same columns as you would visually see in
1440a browser. This behavior can be disabled by setting the I<gridmap>
1441parameter to 0. HTML is stripped from the entire textual content of a
1442cell before header matches are attempted -- unless the I<keep_html>
1443parameter was enabled.
1444
1445I<Depth> and I<Count> are more specific ways to specify tables in
1446relation to one another. I<Depth> represents how deeply a table
1447resides in other tables. The depth of a top-level table in the
1448document is 0. A table within a top-level table has a depth of 1, and
1449so on. Each depth can be thought of as a layer; tables sharing the
1450same depth are on the same layer. Within each of these layers,
1451I<Count> represents the order in which a table was seen at that depth,
1452starting with 0. Providing both a I<depth> and a I<count> will
1453uniquely specify a table within a document.
1454
1455Each of the I<Headers>, I<Depth>, and I<Count> specifications are
1456cumulative in their effect on the overall extraction. For instance, if
1457you specify only a I<Depth>, then you get all tables at that depth
1458(note that these could very well reside in separate higher-level
1459tables throughout the document since depth extends across tables). If
1460you specify only a I<Count>, then the tables at that I<Count> from all
1461depths are returned (i.e., the I<n>th occurrence of a table at each
1462depth). If you only specify I<Headers>, then you get all tables in the
1463document containing those column headers. If you have specified
1464multiple constraints of I<Headers>, I<Depth>, and I<Count>, then each
1465constraint has veto power over whether a particular table is
1466extracted.
1467
1468If no I<Headers>, I<Depth>, or I<Count> are specified, then all
1469tables match.
1470
1471Text that is gathered from the tables is decoded with HTML::Entities
1472by default; this can be disabled by setting the I<decode> parameter to
14730.
1474
1475=head2 Chains
1476
1477Make sure you fully understand the notions of I<depth> and I<count>
1478before proceeding, because it is about to become a bit more involved.
1479
1480Table matches using I<Headers>, I<Depth>, or I<Count> can be chained
1481together in order to further specify tables relative to one
1482another. Links in chains are successively applied to tables within
1483tables. Top level constraints (i.e., I<header>, I<depth>, and I<count>
1484parameters for the TableExtract object) behave as the first link in
1485the chain. Additional links are specified using the I<chain>
1486parameter. Each link in the chain has its own set of constraints. For
1487example:
1488
1489 $te = new HTML::TableExtract
1490 (
1491 headers => [qw(Summary Region)],
1492 chain => [
1493 { depth => 0, count => 2 },
1494 { headers => [qw(Part Qty Cost)] }
1495 ],
1496 );
1497
1498The matching process in this case will start with B<all> tables in the
1499document that have "Summary" and "Region" in their headers. For now,
1500assume that there was only one table that matched these headers. Each
1501table contained within that table will be compared to the first link
1502in the chain. Depth 0 means that a matching table must be immediately
1503contained within the current table; count 2 means that the matching
1504table must also be the third at that depth (counts and depths start at
15050). In other words, the next link of the chain will match on the
1506third table immediately contained within our first matched table. Once
1507this link matches, then B<all> further tables beneath that table that
1508have "Part", "Qty", and "Cost" in their headers will match. By
1509default, it is only tables at the end of the chains that are returned
1510to the application, so these tables are returned.
1511
1512Each time a link in a chain matches a table, an additional context for
1513I<depth> and I<count> is established. It is perhaps easiest to
1514visualize a I<context> as a brand-new HTML document, with new depths
1515and counts to compare to the remaining links in the chain. The top
1516level HTML document is the first context. Each table in the document
1517establishes a new context. I<Depth> in a chain link is relative to the
1518context that the matching table creates (i.e., a link depth of 0 would
1519be a table immediately contained within the table that matched the
1520prior link in the chain). Likewise, that same context keeps track of
1521I<counts> within the new depth scheme for comparison to the remaining
1522links in the chain. Headers still apply if they are present in a link,
1523but they are always independent of context.
1524
1525As it turns out, specifying a depth and count provides a unique
1526address for a table within a context. For non-unique constraints, such
1527as just a depth, or headers, there can be multiple matches for a given
1528link. In these cases the chain "forks" and attempts to make further
1529matches within each of these tables.
1530
1531By default, chains are I<elastic>. This means that when a particular
1532link does not match on a table, it is passed down to subtables
1533unchanged. For example:
1534
1535 $te = new HTML::TableExtract
1536 (
1537 headers => [qw(Summary Region)],
1538 chain => [
1539 { headers => [qw(Part Qty Cost)] }
1540 ],
1541 );
1542
1543If there are intervening tables between the two header queries, they
1544will be ignored; this query will extract all tables with "Part",
1545"Qty", and "Cost" in the headers that are contained in any table with
1546"Summary" and "Region" in its headers, regardless of how embedded the
1547inner tables are. If you want a chain to be inelastic, you can set the
1548I<elastic> parameter to 0 for the whole TableExtract object. Using the
1549same example:
1550
1551 $te = new HTML::TableExtract
1552 (
1553 headers => [qw(Summary Region)],
1554 chain => [
1555 { headers => [qw(Part Qty Cost)] }
1556 ],
1557 elastic => 0,
1558 );
1559
1560In this case, the inner table (Part, Qty, Cost) must be B<immediately>
1561contained within the outer table (Summary, Region) in order for the
1562match to take place. This is equivalent to specifying a depth of 0 for
1563each link in the chain; if you only want particular links to be
1564inelastic, then simply set their depths to 0.
1565
1566By default, only tables that match at the end of the chains are
1567retained. The intermediate matches along the chain are referred to as
1568I<waypoints>, and are not extracted by default. A waypoint may be
1569retained, however, by specifiying the I<keep> parameter in that link
1570of the chain. This parameter may be specified at the top level as well
1571if you want to keep tables that match the first set of constraints in
1572the object. If you want to keep all tables that match along the chain,
1573the specify the I<keepall> parameter at the top level.
1574
1575Are chains overkill? Probably. In reality, nested HTML tables tend not
1576to be very deep, so there will usually not be much need for lots of
1577links in a chain. Theoretically, however, chains offer precise
1578targeting of tables relative to one another, no matter how deeply
1579nested they are.
1580
1581=head2 Pop Quiz
1582
1583What happens with the following table extraction?
1584
1585 $te = new HTML::TableExtract(
1586 chain => [ { depth => 0 } ],
1587 );
1588
1589Answer: All tables that are contained in another table are extracted
1590from the document. In this case, there were no top-level constraints
1591specified, which if you recall means that B<all> tables match the
1592first set of constraints (or non-constraints, in this case!). A depth
1593of 0 in the next link of the chain means that the matching table must
1594be immediately contained within the table from a prior match.
1595
1596The following is equivalent:
1597
1598 $te = new HTML::TableExtract(
1599 depth => 1,
1600 subtables => 1,
1601 )
1602
1603The I<subtables> parameter tells TableExtract to scoop up all tables
1604contained within the matching tables. In conjunction with a depth of
16051, this has the affect of discarding all top-level tables in the
1606document, which is exactly what occurred in the prior example.
1607
1608=head2 Advice
1609
1610The main point of this module was to provide a flexible method of
1611extracting tabular information from HTML documents without relying to
1612heavily on the document layout. For that reason, I suggest using
1613I<Headers> whenever possible -- that way, you are anchoring your
1614extraction on what the document is trying to communicate rather than
1615some feature of the HTML comprising the document (other than the fact
1616that the data is contained in a table).
1617
1618HTML::TableExtract is a subclass of HTML::Parser, and as such inherits
1619all of its basic methods. In particular, C<start()>, C<end()>, and
1620C<text()> are utilized. Feel free to override them, but if you do not
1621eventually invoke them in the SUPER class with some content, results
1622are not guaranteed.
1623
1624=head1 METHODS
1625
1626The following are the top-level methods of the HTML::TableExtract
1627object. Tables that have matched a query are actually returned as
1628separate objects of type HTML::TableExtract::TableState. These table
1629state objects have their own methods, documented further below. There
1630are some top-level methods that are present for convenience and
1631backwards compatibility that are nothing more than front-ends for
1632equivalent table state methods.
1633
1634=over
1635
1636=head2 Constructor
1637
1638=item new()
1639
1640Return a new HTML::TableExtract object. Valid attributes are:
1641
1642=over
1643
1644=item headers
1645
1646Passed as an array reference, headers specify strings of interest at
1647the top of columns within targeted tables. These header strings will
1648eventually be passed through a non-anchored, case-insensitive regular
1649expression, so regexp special characters are allowed. The table row
1650containing the headers is B<not> returned. Columns that are not
1651beneath one of the provided headers will be ignored. Columns will, by
1652default, be rearranged into the same order as the headers you provide
1653(see the I<automap> parameter for more information). Additionally, by
1654default columns are considered what you would see visually beneath
1655that header when the table is rendered in a browser. See the
1656I<gridmap> parameter for more information. HTML within a header is
1657stripped before the match is attempted, unless the B<keep_html>
1658parameter was specified.
1659
1660=item depth
1661
1662Specify how embedded in other tables your tables of interest should
1663be. Top-level tables in the HTML document have a depth of 0, tables
1664within top-level tables have a depth of 1, and so on.
1665
1666=item count
1667
1668Specify which table within each depth you are interested in, beginning
1669with 0.
1670
1671=item chain
1672
1673List of additional constraints to be matched sequentially from the top
1674level constraints. This is a reference to an array of hash
1675references. Each hash is a link in the chain, and can be specified in
1676terms of I<depth>, I<count>, and I<headers>. Further modifiers include
1677I<keep>, which means to retain the table if it would normally be
1678dropped as a waypoint.
1679
1680=item automap
1681
1682Automatically applies the ordering reported by column_map() to the
1683rows returned by rows(). This only makes a difference if you have
1684specified I<Headers> and they turn out to be in a different order in
1685the table than what you specified. Automap will rearrange the columns
1686in the same order as the headers appear. To get the original ordering,
1687you will need to take another slice of each row using
1688column_map(). I<automap> is enabled by default.
1689
1690=item gridmap
1691
1692Controls whether the table contents are returned as a grid or a
1693tree. ROWSPAN and COLSPAN issues are compensated for, and columns
1694really are columns. Empty phantom cells are created where they would
1695have been obscured by ROWSPAN or COLSPAN settings. This really becomes
1696an issue when extracting columns beneath headers. Enabled by default.
1697
1698=item keepall
1699
1700Keep all tables that matched along a chain, including tables matched
1701by top level contraints. By default, waypoints are dropped and only
1702the matches at the end of the chain are retained. To retain a
1703particular waypoint along a chain, use the I<keep> parameter in that
1704link.
1705
1706=item elastic
1707
1708When set to 0, all links in chains will be treated as though they had
1709a depth of 0 specified, which means there can be no intervening
1710unmatched tables between matches on links.
1711
1712=item subtables
1713
1714Extract all tables within matched tables.
1715
1716=item decode
1717
1718Automatically decode retrieved text with
1719HTML::Entities::decode_entities(). Enabled by default.
1720
1721=item br_translate
1722
1723Translate <br> tags into newlines. Sometimes the remaining text can be
1724hard to parse if the <br> tag is simply dropped. Enabled by default.
1725Has no effect if I<keep_html> is enabled.
1726
1727=item keep_html
1728
1729Return the raw HTML contained in the cell, rather than just the
1730visible text. Embedded tables are B<not> retained in the HTML
1731extracted from a cell. Patterns for header matches must take into
1732account HTML in the string if this option is enabled.
1733
1734=item debug
1735
1736Prints some debugging information to STDOUT, more for higher values.
1737
1738=back
1739
1740=head2 Regular Methods
1741
1742=item depths()
1743
1744Returns all depths that contained matched tables in the document.
1745
1746=item counts($depth)
1747
1748For a particular depth, returns all counts that contained matched
1749tables.
1750
1751=item table_state($depth, $count)
1752
1753For a particular depth and count, return the table state object for
1754the table found, if any.
1755
1756=item table_states()
1757
1758Return table state objects for all tables that matched.
1759
1760=item first_table_state_found()
1761
1762Return the table state object for the first table matched in the
1763document.
1764
1765=head2 TABLE STATE METHODS
1766
1767The following methods are invoked from an
1768HTML::TableExtract::TableState object, such as those returned from the
1769C<table_states()> method.
1770
1771=item rows()
1772
1773Return all rows within a matched table. Each row returned is a
1774reference to an array containing the text of each cell.
1775
1776=item depth()
1777
1778Return the (absolute) depth at which this table was found.
1779
1780=item count()
1781
1782Return the count for this table within the depth it was found.
1783
1784=item coords()
1785
1786Return depth and count in a list.
1787
1788=item column_map()
1789
1790Return the order (via indices) in which the provided headers were
1791found. These indices can be used as slices on rows to either order the
1792rows in the same order as headers or restore the rows to their natural
1793order, depending on whether the rows have been pre-adjusted using the
1794I<automap> parameter.
1795
1796=item lineage()
1797
1798Returns the path of matched tables that led to matching this
1799table. Lineage only makes sense if chains were used. Tables that were
1800not matched by a link in the chain are not included in lineage. The
1801lineage path is a list of array refs containing depth and count values
1802for each table involved.
1803
1804=head2 Procedural Methods
1805
1806The following top level methods are alternatives to invoking methods
1807in a table state object. If you do not want to deal with table state
1808objects, then these methods are for you. The "tables" they deal in are
1809actually just arrays of arrays, which happen to be the current
1810internal data structure of the table state objects. They are here for
1811backwards compatibility.
1812
1813=item table($depth, $count)
1814
1815Same as C<table_state()>, but returns the internal data structure
1816rather than the table state object.
1817
1818=item tables()
1819
1820Same as C<table_states()>, but returns the data structures rather than
1821the table state objects.
1822
1823=item first_table_found()
1824
1825Same as C<first_table_state_found()>, except returns the data
1826structure for first table that matched.
1827
1828=item table_coords($table)
1829
1830Returns the depth and count for a particular table data structure. See
1831the C<coords()> method provided by table state objects.
1832
1833=item rows()
1834
1835=item rows($table)
1836
1837Return a lsit of the rows for a particular table data structure (first
1838table found by default). See the C<rows()> method provided by table
1839state objects.
1840
1841=item column_map()
1842
1843=item column_map($table)
1844
1845Return the column map for a particular table data structure (first
1846found by default). See the C<column_map()> method provided by table
1847state objects.
1848
1849=back
1850
1851=head1 REQUIRES
1852
1853HTML::Parser(3), HTML::Entities(3)
1854
1855=head1 AUTHOR
1856
1857Matthew P. Sisk, E<lt>F<sisk@mojotoad.com>E<gt>
1858
1859=head1 COPYRIGHT
1860
1861Copyright (c) 2000-2002 Matthew P. Sisk.
1862All rights reserved. All wrongs revenged. This program is free
1863software; you can redistribute it and/or modify it under the
1864same terms as Perl itself.
1865
1866=head1 SEE ALSO
1867
1868HTML::Parser(3), perl(1).
1869
1870=cut
1871
1872In honor of fragmented markup languages and sugar mining:
1873
1874The Good and The Bad
1875Ted Hawkins (1936-1994)
1876
1877Living is good
1878 when you have someone to share it with
1879Laughter is bad
1880 when there is no one there to share it with
1881Talking is sad
1882 if you've got no one to talk to
1883Dying is good
1884 when the one you love grows tired of you
1885
1886Sugar is no good
1887 once it's cast among the white sand
1888What the point
1889 in pulling the gray hairs from among the black strands
1890When you're old
1891 you shouldn't walk in the fast lane
1892Oh ain't it useless
1893 to keep trying to draw true love from that man
1894
1895He'll hurt you,
1896 Yes just for the sake of hurting you
1897and he'll hate you
1898 if you try to love him just the same
1899He'll use you
1900 and everything you have to offer him
1901On your way girl
1902 Get out and find you someone new