Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "HTML::TableExtract 3" | |
132 | .TH HTML::TableExtract 3 "2002-04-04" "perl v5.8.0" "User Contributed Perl Documentation" | |
133 | .SH "NAME" | |
134 | HTML::TableExtract \- Perl extension for extracting the text contained in tables within an HTML document. | |
135 | .SH "SYNOPSIS" | |
136 | .IX Header "SYNOPSIS" | |
137 | .Vb 3 | |
138 | \& # Matched tables are returned as "table state" objects; tables can be | |
139 | \& # matched using column headers, depth, count within a depth, or some | |
140 | \& # combination of the three. | |
141 | .Ve | |
142 | .PP | |
143 | .Vb 5 | |
144 | \& # Using column header information. Assume an HTML document with | |
145 | \& # tables that have "Date", "Price", and "Cost" somewhere in a | |
146 | \& # row. The columns beneath those headings are what you want to | |
147 | \& # extract. They will be returned in the same order as you specified | |
148 | \& # the headers since 'automap' is enabled by default. | |
149 | .Ve | |
150 | .PP | |
151 | .Vb 3 | |
152 | \& use HTML::TableExtract; | |
153 | \& $te = new HTML::TableExtract( headers => [qw(Date Price Cost)] ); | |
154 | \& $te->parse($html_string); | |
155 | .Ve | |
156 | .PP | |
157 | .Vb 7 | |
158 | \& # Examine all matching tables | |
159 | \& foreach $ts ($te->table_states) { | |
160 | \& print "Table (", join(',', $ts->coords), "):\en"; | |
161 | \& foreach $row ($ts->rows) { | |
162 | \& print join(',', @$row), "\en"; | |
163 | \& } | |
164 | \& } | |
165 | .Ve | |
166 | .PP | |
167 | .Vb 7 | |
168 | \& # Old style, using top level methods rather than table state objects. | |
169 | \& foreach $table ($te->tables) { | |
170 | \& print "Table (", join(',', $te->table_coords($table)), "):\en"; | |
171 | \& foreach $row ($te->rows($table)) { | |
172 | \& print join(',', @$row), "\en"; | |
173 | \& } | |
174 | \& } | |
175 | .Ve | |
176 | .PP | |
177 | .Vb 5 | |
178 | \& # Shorthand...top level rows() method assumes the first table found | |
179 | \& # in the document if no arguments are supplied. | |
180 | \& foreach $row ($te->rows) { | |
181 | \& print join(',', @$row), "\en"; | |
182 | \& } | |
183 | .Ve | |
184 | .PP | |
185 | .Vb 7 | |
186 | \& # Using depth and count information. Every table in the document has | |
187 | \& # a unique depth and count tuple, so when both are specified it is a | |
188 | \& # unique table. Depth and count both begin with 0, so in this case we | |
189 | \& # are looking for a table (depth 2) within a table (depth 1) within a | |
190 | \& # table (depth 0, which is the top level HTML document). In addition, | |
191 | \& # it must be the third (count 2) such instance of a table at that | |
192 | \& # depth. | |
193 | .Ve | |
194 | .PP | |
195 | .Vb 8 | |
196 | \& $te = new HTML::TableExtract( depth => 2, count => 2 ); | |
197 | \& $te->parse($html_string); | |
198 | \& foreach $ts ($te->table_states) { | |
199 | \& print "Table found at ", join(',', $ts->coords), ":\en"; | |
200 | \& foreach $row ($ts->rows) { | |
201 | \& print " ", join(',', @$row), "\en"; | |
202 | \& } | |
203 | \& } | |
204 | .Ve | |
205 | .SH "DESCRIPTION" | |
206 | .IX Header "DESCRIPTION" | |
207 | HTML::TableExtract is a subclass of HTML::Parser that serves to | |
208 | extract the textual information from tables of interest contained | |
209 | within an \s-1HTML\s0 document. The text from each extracted table is stored | |
210 | in table state objects which hold the information as an array of arrays | |
211 | that represent the rows and cells of that table. | |
212 | .PP | |
213 | There are three constraints available to specify which tables you | |
214 | would like to extract from a document: \fIHeaders\fR, \fIDepth\fR, and | |
215 | \&\fICount\fR. | |
216 | .PP | |
217 | \&\fIHeaders\fR, the most flexible and adaptive of the techniques, involves | |
218 | specifying text in an array that you expect to appear above the data | |
219 | in the tables of interest. Once all headers have been located in a row | |
220 | of that table, all further cells beneath the columns that matched your | |
221 | headers are extracted. All other columns are ignored: think of it as | |
222 | vertical slices through a table. In addition, TableExtract | |
223 | automatically rearranges each row in the same order as the headers you | |
224 | provided. If you would like to disable this, set \fIautomap\fR to 0 | |
225 | during object creation, and instead rely on the \fIcolumn_map()\fR method to | |
226 | find out the order in which the headers were found. Furthermore, | |
227 | TableExtract will automatically compensate for cell span issues so | |
228 | that columns are really the same columns as you would visually see in | |
229 | a browser. This behavior can be disabled by setting the \fIgridmap\fR | |
230 | parameter to 0. \s-1HTML\s0 is stripped from the entire textual content of a | |
231 | cell before header matches are attempted \*(-- unless the \fIkeep_html\fR | |
232 | parameter was enabled. | |
233 | .PP | |
234 | \&\fIDepth\fR and \fICount\fR are more specific ways to specify tables in | |
235 | relation to one another. \fIDepth\fR represents how deeply a table | |
236 | resides in other tables. The depth of a top-level table in the | |
237 | document is 0. A table within a top-level table has a depth of 1, and | |
238 | so on. Each depth can be thought of as a layer; tables sharing the | |
239 | same depth are on the same layer. Within each of these layers, | |
240 | \&\fICount\fR represents the order in which a table was seen at that depth, | |
241 | starting with 0. Providing both a \fIdepth\fR and a \fIcount\fR will | |
242 | uniquely specify a table within a document. | |
243 | .PP | |
244 | The \fIHeaders\fR, \fIDepth\fR, and \fICount\fR specifications are | |
245 | cumulative in their effect on the overall extraction. For instance, if | |
246 | you specify only a \fIDepth\fR, then you get all tables at that depth | |
247 | (note that these could very well reside in separate higher-level | |
248 | tables throughout the document since depth extends across tables). If | |
249 | you specify only a \fICount\fR, then the tables at that \fICount\fR from all | |
250 | depths are returned (i.e., the \fIn\fRth occurrence of a table at each | |
251 | depth). If you only specify \fIHeaders\fR, then you get all tables in the | |
252 | document containing those column headers. If you have specified | |
253 | multiple constraints of \fIHeaders\fR, \fIDepth\fR, and \fICount\fR, then each | |
254 | constraint has veto power over whether a particular table is | |
255 | extracted. | |
256 | .PP | |
257 | If no \fIHeaders\fR, \fIDepth\fR, or \fICount\fR are specified, then all | |
258 | tables match. | |
259 | .PP | |
260 | Text that is gathered from the tables is decoded with HTML::Entities | |
261 | by default; this can be disabled by setting the \fIdecode\fR parameter to | |
262 | 0. | |
263 | .Sh "Chains" | |
264 | .IX Subsection "Chains" | |
265 | Make sure you fully understand the notions of \fIdepth\fR and \fIcount\fR | |
266 | before proceeding, because it is about to become a bit more involved. | |
267 | .PP | |
268 | Table matches using \fIHeaders\fR, \fIDepth\fR, or \fICount\fR can be chained | |
269 | together in order to further specify tables relative to one | |
270 | another. Links in chains are successively applied to tables within | |
271 | tables. Top level constraints (i.e., \fIheaders\fR, \fIdepth\fR, and \fIcount\fR | |
272 | parameters for the TableExtract object) behave as the first link in | |
273 | the chain. Additional links are specified using the \fIchain\fR | |
274 | parameter. Each link in the chain has its own set of constraints. For | |
275 | example: | |
276 | .PP | |
277 | .Vb 8 | |
278 | \& $te = new HTML::TableExtract | |
279 | \& ( | |
280 | \& headers => [qw(Summary Region)], | |
281 | \& chain => [ | |
282 | \& { depth => 0, count => 2 }, | |
283 | \& { headers => [qw(Part Qty Cost)] } | |
284 | \& ], | |
285 | \& ); | |
286 | .Ve | |
287 | .PP | |
288 | The matching process in this case will start with \fBall\fR tables in the | |
289 | document that have \*(L"Summary\*(R" and \*(L"Region\*(R" in their headers. For now, | |
290 | assume that there was only one table that matched these headers. Each | |
291 | table contained within that table will be compared to the first link | |
292 | in the chain. Depth 0 means that a matching table must be immediately | |
293 | contained within the current table; count 2 means that the matching | |
294 | table must also be the third at that depth (counts and depths start at | |
295 | 0). In other words, the next link of the chain will match on the | |
296 | third table immediately contained within our first matched table. Once | |
297 | this link matches, then \fBall\fR further tables beneath that table that | |
298 | have \*(L"Part\*(R", \*(L"Qty\*(R", and \*(L"Cost\*(R" in their headers will match. By | |
299 | default, it is only tables at the end of the chains that are returned | |
300 | to the application, so these tables are returned. | |
301 | .PP | |
302 | Each time a link in a chain matches a table, an additional context for | |
303 | \&\fIdepth\fR and \fIcount\fR is established. It is perhaps easiest to | |
304 | visualize a \fIcontext\fR as a brand-new \s-1HTML\s0 document, with new depths | |
305 | and counts to compare to the remaining links in the chain. The top | |
306 | level \s-1HTML\s0 document is the first context. Each table in the document | |
307 | establishes a new context. \fIDepth\fR in a chain link is relative to the | |
308 | context that the matching table creates (i.e., a link depth of 0 would | |
309 | be a table immediately contained within the table that matched the | |
310 | prior link in the chain). Likewise, that same context keeps track of | |
311 | \&\fIcounts\fR within the new depth scheme for comparison to the remaining | |
312 | links in the chain. Headers still apply if they are present in a link, | |
313 | but they are always independent of context. | |
314 | .PP | |
315 | As it turns out, specifying a depth and count provides a unique | |
316 | address for a table within a context. For non-unique constraints, such | |
317 | as just a depth, or headers, there can be multiple matches for a given | |
318 | link. In these cases the chain \*(L"forks\*(R" and attempts to make further | |
319 | matches within each of these tables. | |
320 | .PP | |
321 | By default, chains are \fIelastic\fR. This means that when a particular | |
322 | link does not match on a table, it is passed down to subtables | |
323 | unchanged. For example: | |
324 | .PP | |
325 | .Vb 7 | |
326 | \& $te = new HTML::TableExtract | |
327 | \& ( | |
328 | \& headers => [qw(Summary Region)], | |
329 | \& chain => [ | |
330 | \& { headers => [qw(Part Qty Cost)] } | |
331 | \& ], | |
332 | \& ); | |
333 | .Ve | |
334 | .PP | |
335 | If there are intervening tables between the two header queries, they | |
336 | will be ignored; this query will extract all tables with \*(L"Part\*(R", | |
337 | \&\*(L"Qty\*(R", and \*(L"Cost\*(R" in the headers that are contained in any table with | |
338 | \&\*(L"Summary\*(R" and \*(L"Region\*(R" in its headers, regardless of how embedded the | |
339 | inner tables are. If you want a chain to be inelastic, you can set the | |
340 | \&\fIelastic\fR parameter to 0 for the whole TableExtract object. Using the | |
341 | same example: | |
342 | .PP | |
343 | .Vb 8 | |
344 | \& $te = new HTML::TableExtract | |
345 | \& ( | |
346 | \& headers => [qw(Summary Region)], | |
347 | \& chain => [ | |
348 | \& { headers => [qw(Part Qty Cost)] } | |
349 | \& ], | |
350 | \& elastic => 0, | |
351 | \& ); | |
352 | .Ve | |
353 | .PP | |
354 | In this case, the inner table (Part, Qty, Cost) must be \fBimmediately\fR | |
355 | contained within the outer table (Summary, Region) in order for the | |
356 | match to take place. This is equivalent to specifying a depth of 0 for | |
357 | each link in the chain; if you only want particular links to be | |
358 | inelastic, then simply set their depths to 0. | |
359 | .PP | |
360 | By default, only tables that match at the end of the chains are | |
361 | retained. The intermediate matches along the chain are referred to as | |
362 | \&\fIwaypoints\fR, and are not extracted by default. A waypoint may be | |
363 | retained, however, by specifying the \fIkeep\fR parameter in that link | |
364 | of the chain. This parameter may be specified at the top level as well | |
365 | if you want to keep tables that match the first set of constraints in | |
366 | the object. If you want to keep all tables that match along the chain, | |
367 | then specify the \fIkeepall\fR parameter at the top level. | |
368 | .PP | |
369 | Are chains overkill? Probably. In reality, nested \s-1HTML\s0 tables tend not | |
370 | to be very deep, so there will usually not be much need for lots of | |
371 | links in a chain. Theoretically, however, chains offer precise | |
372 | targeting of tables relative to one another, no matter how deeply | |
373 | nested they are. | |
374 | .Sh "Pop Quiz" | |
375 | .IX Subsection "Pop Quiz" | |
376 | What happens with the following table extraction? | |
377 | .PP | |
378 | .Vb 3 | |
379 | \& $te = new HTML::TableExtract( | |
380 | \& chain => [ { depth => 0 } ], | |
381 | \& ); | |
382 | .Ve | |
383 | .PP | |
384 | Answer: All tables that are contained in another table are extracted | |
385 | from the document. In this case, there were no top-level constraints | |
386 | specified, which if you recall means that \fBall\fR tables match the | |
387 | first set of constraints (or non\-constraints, in this case!). A depth | |
388 | of 0 in the next link of the chain means that the matching table must | |
389 | be immediately contained within the table from a prior match. | |
390 | .PP | |
391 | The following is equivalent: | |
392 | .PP | |
393 | .Vb 4 | |
394 | \& $te = new HTML::TableExtract( | |
395 | \& depth => 1, | |
396 | \& subtables => 1, | |
397 | \& ) | |
398 | .Ve | |
399 | .PP | |
400 | The \fIsubtables\fR parameter tells TableExtract to scoop up all tables | |
401 | contained within the matching tables. In conjunction with a depth of | |
402 | 1, this has the effect of discarding all top-level tables in the | |
403 | document, which is exactly what occurred in the prior example. | |
404 | .Sh "Advice" | |
405 | .IX Subsection "Advice" | |
406 | The main point of this module was to provide a flexible method of | |
407 | extracting tabular information from \s-1HTML\s0 documents without relying too | |
408 | heavily on the document layout. For that reason, I suggest using | |
409 | \&\fIHeaders\fR whenever possible \*(-- that way, you are anchoring your | |
410 | extraction on what the document is trying to communicate rather than | |
411 | some feature of the \s-1HTML\s0 comprising the document (other than the fact | |
412 | that the data is contained in a table). | |
413 | .PP | |
414 | HTML::TableExtract is a subclass of HTML::Parser, and as such inherits | |
415 | all of its basic methods. In particular, \f(CW\*(C`start()\*(C'\fR, \f(CW\*(C`end()\*(C'\fR, and | |
416 | \&\f(CW\*(C`text()\*(C'\fR are utilized. Feel free to override them, but if you do not | |
417 | eventually invoke them in the \s-1SUPER\s0 class with some content, results | |
418 | are not guaranteed. | |
419 | .SH "METHODS" | |
420 | .IX Header "METHODS" | |
421 | The following are the top-level methods of the HTML::TableExtract | |
422 | object. Tables that have matched a query are actually returned as | |
423 | separate objects of type HTML::TableExtract::TableState. These table | |
424 | state objects have their own methods, documented further below. There | |
425 | are some top-level methods that are present for convenience and | |
426 | backwards compatibility that are nothing more than front-ends for | |
427 | equivalent table state methods. | |
428 | .Sh "Constructor" | |
429 | .IX Subsection "Constructor" | |
430 | .IP "\fInew()\fR" 4 | |
431 | .IX Item "new()" | |
432 | Return a new HTML::TableExtract object. Valid attributes are: | |
433 | .RS 4 | |
434 | .IP "headers" 4 | |
435 | .IX Item "headers" | |
436 | Passed as an array reference, headers specify strings of interest at | |
437 | the top of columns within targeted tables. These header strings will | |
438 | eventually be passed through a non\-anchored, case-insensitive regular | |
439 | expression, so regexp special characters are allowed. The table row | |
440 | containing the headers is \fBnot\fR returned. Columns that are not | |
441 | beneath one of the provided headers will be ignored. Columns will, by | |
442 | default, be rearranged into the same order as the headers you provide | |
443 | (see the \fIautomap\fR parameter for more information). Additionally, by | |
444 | default columns are considered what you would see visually beneath | |
445 | that header when the table is rendered in a browser. See the | |
446 | \&\fIgridmap\fR parameter for more information. \s-1HTML\s0 within a header is | |
447 | stripped before the match is attempted, unless the \fBkeep_html\fR | |
448 | parameter was specified. | |
449 | .IP "depth" 4 | |
450 | .IX Item "depth" | |
451 | Specify how embedded in other tables your tables of interest should | |
452 | be. Top-level tables in the \s-1HTML\s0 document have a depth of 0, tables | |
453 | within top-level tables have a depth of 1, and so on. | |
454 | .IP "count" 4 | |
455 | .IX Item "count" | |
456 | Specify which table within each depth you are interested in, beginning | |
457 | with 0. | |
458 | .IP "chain" 4 | |
459 | .IX Item "chain" | |
460 | List of additional constraints to be matched sequentially from the top | |
461 | level constraints. This is a reference to an array of hash | |
462 | references. Each hash is a link in the chain, and can be specified in | |
463 | terms of \fIdepth\fR, \fIcount\fR, and \fIheaders\fR. Further modifiers include | |
464 | \&\fIkeep\fR, which means to retain the table if it would normally be | |
465 | dropped as a waypoint. | |
466 | .IP "automap" 4 | |
467 | .IX Item "automap" | |
468 | Automatically applies the ordering reported by \fIcolumn_map()\fR to the | |
469 | rows returned by \fIrows()\fR. This only makes a difference if you have | |
470 | specified \fIHeaders\fR and they turn out to be in a different order in | |
471 | the table than what you specified. Automap will rearrange the columns | |
472 | in the same order as the headers appear. To get the original ordering, | |
473 | you will need to take another slice of each row using | |
474 | \&\fIcolumn_map()\fR. \fIautomap\fR is enabled by default. | |
475 | .IP "gridmap" 4 | |
476 | .IX Item "gridmap" | |
477 | Controls whether the table contents are returned as a grid or a | |
478 | tree. \s-1ROWSPAN\s0 and \s-1COLSPAN\s0 issues are compensated for, and columns | |
479 | really are columns. Empty phantom cells are created where they would | |
480 | have been obscured by \s-1ROWSPAN\s0 or \s-1COLSPAN\s0 settings. This really becomes | |
481 | an issue when extracting columns beneath headers. Enabled by default. | |
482 | .IP "keepall" 4 | |
483 | .IX Item "keepall" | |
484 | Keep all tables that matched along a chain, including tables matched | |
485 | by top level constraints. By default, waypoints are dropped and only | |
486 | the matches at the end of the chain are retained. To retain a | |
487 | particular waypoint along a chain, use the \fIkeep\fR parameter in that | |
488 | link. | |
489 | .IP "elastic" 4 | |
490 | .IX Item "elastic" | |
491 | When set to 0, all links in chains will be treated as though they had | |
492 | a depth of 0 specified, which means there can be no intervening | |
493 | unmatched tables between matches on links. | |
494 | .IP "subtables" 4 | |
495 | .IX Item "subtables" | |
496 | Extract all tables within matched tables. | |
497 | .IP "decode" 4 | |
498 | .IX Item "decode" | |
499 | Automatically decode retrieved text with | |
500 | \&\fIHTML::Entities::decode_entities()\fR. Enabled by default. | |
501 | .IP "br_translate" 4 | |
502 | .IX Item "br_translate" | |
503 | Translate <br> tags into newlines. Sometimes the remaining text can be | |
504 | hard to parse if the <br> tag is simply dropped. Enabled by default. | |
505 | Has no effect if \fIkeep_html\fR is enabled. | |
506 | .IP "keep_html" 4 | |
507 | .IX Item "keep_html" | |
508 | Return the raw \s-1HTML\s0 contained in the cell, rather than just the | |
509 | visible text. Embedded tables are \fBnot\fR retained in the \s-1HTML\s0 | |
510 | extracted from a cell. Patterns for header matches must take into | |
511 | account \s-1HTML\s0 in the string if this option is enabled. | |
512 | .IP "debug" 4 | |
513 | .IX Item "debug" | |
514 | Prints some debugging information to \s-1STDOUT\s0, more for higher values. | |
515 | .RE | |
516 | .RS 4 | |
517 | .Sh "Regular Methods" | |
518 | .IX Subsection "Regular Methods" | |
519 | .RE | |
520 | .IP "\fIdepths()\fR" 4 | |
521 | .IX Item "depths()" | |
522 | Returns all depths that contained matched tables in the document. | |
523 | .IP "counts($depth)" 4 | |
524 | .IX Item "counts($depth)" | |
525 | For a particular depth, returns all counts that contained matched | |
526 | tables. | |
527 | .ie n .IP "table_state($depth, $count)" 4 | |
528 | .el .IP "table_state($depth, \f(CW$count\fR)" 4 | |
529 | .IX Item "table_state($depth, $count)" | |
530 | For a particular depth and count, return the table state object for | |
531 | the table found, if any. | |
532 | .IP "\fItable_states()\fR" 4 | |
533 | .IX Item "table_states()" | |
534 | Return table state objects for all tables that matched. | |
535 | .IP "\fIfirst_table_state_found()\fR" 4 | |
536 | .IX Item "first_table_state_found()" | |
537 | Return the table state object for the first table matched in the | |
538 | document. | |
539 | .Sh "\s-1TABLE\s0 \s-1STATE\s0 \s-1METHODS\s0" | |
540 | .IX Subsection "TABLE STATE METHODS" | |
541 | The following methods are invoked from an | |
542 | HTML::TableExtract::TableState object, such as those returned from the | |
543 | \&\f(CW\*(C`table_states()\*(C'\fR method. | |
544 | .IP "\fIrows()\fR" 4 | |
545 | .IX Item "rows()" | |
546 | Return all rows within a matched table. Each row returned is a | |
547 | reference to an array containing the text of each cell. | |
548 | .IP "\fIdepth()\fR" 4 | |
549 | .IX Item "depth()" | |
550 | Return the (absolute) depth at which this table was found. | |
551 | .IP "\fIcount()\fR" 4 | |
552 | .IX Item "count()" | |
553 | Return the count for this table within the depth it was found. | |
554 | .IP "\fIcoords()\fR" 4 | |
555 | .IX Item "coords()" | |
556 | Return depth and count in a list. | |
557 | .IP "\fIcolumn_map()\fR" 4 | |
558 | .IX Item "column_map()" | |
559 | Return the order (via indices) in which the provided headers were | |
560 | found. These indices can be used as slices on rows to either order the | |
561 | rows in the same order as headers or restore the rows to their natural | |
562 | order, depending on whether the rows have been pre-adjusted using the | |
563 | \&\fIautomap\fR parameter. | |
564 | .IP "\fIlineage()\fR" 4 | |
565 | .IX Item "lineage()" | |
566 | Returns the path of matched tables that led to matching this | |
567 | table. Lineage only makes sense if chains were used. Tables that were | |
568 | not matched by a link in the chain are not included in lineage. The | |
569 | lineage path is a list of array refs containing depth and count values | |
570 | for each table involved. | |
571 | .Sh "Procedural Methods" | |
572 | .IX Subsection "Procedural Methods" | |
573 | The following top level methods are alternatives to invoking methods | |
574 | in a table state object. If you do not want to deal with table state | |
575 | objects, then these methods are for you. The \*(L"tables\*(R" they deal in are | |
576 | actually just arrays of arrays, which happen to be the current | |
577 | internal data structure of the table state objects. They are here for | |
578 | backwards compatibility. | |
579 | .ie n .IP "table($depth, $count)" 4 | |
580 | .el .IP "table($depth, \f(CW$count\fR)" 4 | |
581 | .IX Item "table($depth, $count)" | |
582 | Same as \f(CW\*(C`table_state()\*(C'\fR, but returns the internal data structure | |
583 | rather than the table state object. | |
584 | .IP "\fItables()\fR" 4 | |
585 | .IX Item "tables()" | |
586 | Same as \f(CW\*(C`table_states()\*(C'\fR, but returns the data structures rather than | |
587 | the table state objects. | |
588 | .IP "\fIfirst_table_found()\fR" 4 | |
589 | .IX Item "first_table_found()" | |
590 | Same as \f(CW\*(C`first_table_state_found()\*(C'\fR, except returns the data | |
591 | structure for the first table that matched. | |
592 | .IP "table_coords($table)" 4 | |
593 | .IX Item "table_coords($table)" | |
594 | Returns the depth and count for a particular table data structure. See | |
595 | the \f(CW\*(C`coords()\*(C'\fR method provided by table state objects. | |
596 | .IP "\fIrows()\fR" 4 | |
597 | .IX Item "rows()" | |
598 | .PD 0 | |
599 | .IP "rows($table)" 4 | |
600 | .IX Item "rows($table)" | |
601 | .PD | |
602 | Return a list of the rows for a particular table data structure (first | |
603 | table found by default). See the \f(CW\*(C`rows()\*(C'\fR method provided by table | |
604 | state objects. | |
605 | .IP "\fIcolumn_map()\fR" 4 | |
606 | .IX Item "column_map()" | |
607 | .PD 0 | |
608 | .IP "column_map($table)" 4 | |
609 | .IX Item "column_map($table)" | |
610 | .PD | |
611 | Return the column map for a particular table data structure (first | |
612 | found by default). See the \f(CW\*(C`column_map()\*(C'\fR method provided by table | |
613 | state objects. | |
614 | .SH "REQUIRES" | |
615 | .IX Header "REQUIRES" | |
616 | \&\fIHTML::Parser\fR\|(3), \fIHTML::Entities\fR\|(3) | |
617 | .SH "AUTHOR" | |
618 | .IX Header "AUTHOR" | |
619 | Matthew P. Sisk, <\fIsisk@mojotoad.com\fR> | |
620 | .SH "COPYRIGHT" | |
621 | .IX Header "COPYRIGHT" | |
622 | Copyright (c) 2000\-2002 Matthew P. Sisk. | |
623 | All rights reserved. All wrongs revenged. This program is free | |
624 | software; you can redistribute it and/or modify it under the | |
625 | same terms as Perl itself. | |
626 | .SH "SEE ALSO" | |
627 | .IX Header "SEE ALSO" | |
628 | \&\fIHTML::Parser\fR\|(3), \fIperl\fR\|(1). |