Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | |
2 | require 5; | |
3 | package HTML::Tagset; # Time-stamp: "2000-10-20 19:35:06 MDT" | |
4 | use strict; | |
5 | use vars qw( | |
6 | $VERSION | |
7 | %emptyElement %optionalEndTag %linkElements %boolean_attr | |
8 | %isHeadElement %isBodyElement %isPhraseMarkup | |
9 | %is_Possible_Strict_P_Content | |
10 | %isHeadOrBodyElement | |
11 | %isList %isTableElement %isFormElement | |
12 | %isKnown %canTighten | |
13 | @p_closure_barriers | |
14 | %isCDATA_Parent | |
15 | ); | |
16 | ||
17 | $VERSION = '3.03'; | |
18 | ||
19 | =head1 NAME | |
20 | ||
21 | HTML::Tagset - data tables useful in parsing HTML | |
22 | ||
23 | =head1 SYNOPSIS | |
24 | ||
25 | use HTML::Tagset; | |
26 | # Then use any of the items in the HTML::Tagset package | |
27 | # as need arises | |
28 | ||
29 | =head1 DESCRIPTION | |
30 | ||
31 | This module contains several data tables useful in various kinds of | |
32 | HTML parsing operations. | |
33 | ||
34 | Note that all tag names used are lowercase. | |
35 | ||
36 | In the following documentation, a "hashset" is a hash being used as a | |
37 | set -- the hash conveys that its keys are there, and the actual values | |
38 | associated with the keys are not significant. (But what values are | |
39 | there, are always true.) | |
40 | ||
41 | =over | |
42 | ||
43 | =item hashset %HTML::Tagset::emptyElement | |
44 | ||
45 | This hashset has as keys the tag-names (GIs) of elements that cannot | |
46 | have content. (For example, "base", "br", "hr".) So | |
47 | C<$HTML::Tagset::emptyElement{'hr'}> exists and is true. | |
48 | C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true. | |
49 | ||
50 | =cut | |
51 | ||
52 | #========================================================================== | |
53 | ||
# Hashset of the elements that can never have content, keyed by
# lowercase tag name.
# Names starting with "~" are not real tags: they are the pseudo-element
# names used by HTML::Element and HTML::TreeBuilder.
%emptyElement = map +( $_ => 1 ), (
  qw( area base basefont bgsound br col embed frame hr img ),
  qw( input isindex link meta param spacer wbr ),
  qw( ~comment ~declaration ~literal ~pi ),
);
64 | ||
65 | #--------------------------------------------------------------------------- | |
66 | ||
67 | =item hashset %HTML::Tagset::optionalEndTag | |
68 | ||
69 | This hashset lists tag-names for elements that can have content, but whose | |
70 | end-tags are generally, "safely", omissible. Example: | |
71 | C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true. | |
72 | ||
73 | =cut | |
74 | ||
# Hashset of elements whose end-tags may be left out.
# Not (yet) included: option th tr td
%optionalEndTag = map +( $_ => 1 ), qw( p li dt dd );
76 | ||
77 | #--------------------------------------------------------------------------- | |
78 | ||
79 | =item hash %HTML::Tagset::linkElements | |
80 | ||
81 | Keys in this hash are tagnames for elements that might contain | |
82 | links, and the value for each is a reference to an array of the names | |
83 | of attributes whose values can be links. | |
84 | ||
85 | ||
86 | =cut | |
87 | ||
# Maps a tag name to an array ref naming the attributes of that element
# whose values can be links.  Attribute order inside each array is kept
# exactly as it has always been, in case a caller depends on it.
%linkElements = (
  'a'          => [ qw( href ) ],
  'applet'     => [ qw( archive codebase code ) ],
  'area'       => [ qw( href ) ],
  'base'       => [ qw( href ) ],
  'bgsound'    => [ qw( src ) ],
  'blockquote' => [ qw( cite ) ],
  'body'       => [ qw( background ) ],
  'del'        => [ qw( cite ) ],
  'embed'      => [ qw( pluginspage src ) ],
  'form'       => [ qw( action ) ],
  'frame'      => [ qw( src longdesc ) ],
  'head'       => [ qw( profile ) ],
  'iframe'     => [ qw( src longdesc ) ],
  'ilayer'     => [ qw( background ) ],
  'img'        => [ qw( src lowsrc longdesc usemap ) ],
  'input'      => [ qw( src usemap ) ],
  'ins'        => [ qw( cite ) ],
  'isindex'    => [ qw( action ) ],
  'layer'      => [ qw( background src ) ],
  'link'       => [ qw( href ) ],
  'object'     => [ qw( classid codebase data archive usemap ) ],
  'q'          => [ qw( cite ) ],
  'script'     => [ qw( src for ) ],
  'table'      => [ qw( background ) ],
  'td'         => [ qw( background ) ],
  'th'         => [ qw( background ) ],
  'tr'         => [ qw( background ) ],
  'xmp'        => [ qw( href ) ],
);
119 | ||
120 | #--------------------------------------------------------------------------- | |
121 | ||
122 | =item hash %HTML::Tagset::boolean_attr | |
123 | ||
124 | This hash (not hashset) lists what attributes of what elements can be | |
125 | printed without showing the value (for example, the "noshade" attribute | |
126 | of "hr" elements). For elements with only one such attribute, its value | |
127 | is simply that attribute name. For elements with many such attributes, | |
128 | the value is a reference to a hashset containing all such attributes. | |
129 | ||
130 | =cut | |
131 | ||
# Maps a tag name to the attribute(s) of that element that may be
# printed without a value.  An element with a single such attribute
# maps to the attribute name as a plain string; an element with several
# maps to a hashset (hash ref) of attribute names.
# TODO: make these all hashes
%boolean_attr = (
  # elements with several boolean attributes
  'input'  => { 'checked' => 1, 'disabled' => 1, 'readonly' => 1 },

  # elements with exactly one boolean attribute
  'area'   => 'nohref',
  'hr'     => 'noshade',
  'img'    => 'ismap',
  'option' => 'selected',
  'select' => 'multiple',
  ( map +( $_ => 'nowrap'  ), qw( td th ) ),
  ( map +( $_ => 'compact' ), qw( dir dl menu ol ul ) ),
);
148 | ||
149 | #========================================================================== | |
150 | # List of all elements from Extensible HTML version 1.0 Transitional DTD: | |
151 | # | |
152 | # a abbr acronym address applet area b base basefont bdo big | |
153 | # blockquote body br button caption center cite code col colgroup | |
154 | # dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 | |
155 | # head hr html i iframe img input ins isindex kbd label legend li | |
156 | # link map menu meta noframes noscript object ol optgroup option p | |
157 | # param pre q s samp script select small span strike strong style | |
158 | # sub sup table tbody td textarea tfoot th thead title tr tt u ul | |
159 | # var | |
160 | # | |
161 | # Varia from Mozilla source internal table of tags: | |
162 | # Implemented: | |
163 | # xmp listing wbr nobr frame frameset noframes ilayer | |
164 | # layer nolayer spacer embed multicol | |
165 | # But these are unimplemented: | |
166 | # sound?? keygen?? server?? | |
167 | # Also seen here and there: | |
168 | # marquee?? app?? (both unimplemented) | |
169 | #========================================================================== | |
170 | ||
171 | =item hashset %HTML::Tagset::isPhraseMarkup | |
172 | ||
173 | This hashset contains all phrasal-level elements. | |
174 | ||
175 | =cut | |
176 | ||
# Hashset of the phrasal-level (inline) elements.
# This list once also had: center, hr, table
%isPhraseMarkup = map +( $_ => 1 ), (
  qw( a abbr acronym b basefont bdo big blink br ),
  qw( cite code dfn em embed font i img kbd ),
  qw( nobr noembed q s samp small spacer span ),
  qw( strike strong sub sup tt u var wbr ),
);
186 | ||
187 | ||
188 | =item hashset %HTML::Tagset::is_Possible_Strict_P_Content | |
189 | ||
190 | This hashset contains all phrasal-level elements that can be content of a | |
191 | P element, for a strict model of HTML. | |
192 | ||
193 | =cut | |
194 | ||
# Hashset of everything that may be content of a P element under a
# strict content model: all phrasal markup, the form-related elements,
# and three further elements (object, script, map).
#
# The form-related tag names are spelled out here instead of being
# pulled in via %isFormElement: that hash is only assigned further down
# in this file, so interpolating it at this point added no keys at all
# and the form elements were silently missing from this set.  Keep the
# first row of the list below in step with %isFormElement.
%is_Possible_Strict_P_Content = (
 %isPhraseMarkup,
 map {; $_ => 1} qw(
   input select option optgroup textarea button label
   object script map
 )
 # I've no idea why there's these latter exceptions (object, script, map).
 # I'm just following the HTML4.01 DTD.
);
202 | ||
203 | #from html4 strict: | |
204 | #<!ENTITY % fontstyle "TT | I | B | BIG | SMALL"> | |
205 | # | |
206 | #<!ENTITY % phrase "EM | STRONG | DFN | CODE | | |
207 | # SAMP | KBD | VAR | CITE | ABBR | ACRONYM" > | |
208 | # | |
209 | #<!ENTITY % special | |
210 | # "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO"> | |
211 | # | |
212 | #<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON"> | |
213 | # | |
214 | #<!-- %inline; covers inline or "text-level" elements --> | |
215 | #<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;"> | |
216 | ||
217 | =item hashset %HTML::Tagset::isHeadElement | |
218 | ||
219 | This hashset contains all elements that should be | |
220 | present only in the 'head' element of an HTML document. | |
221 | ||
222 | =cut | |
223 | ||
# Hashset of the elements that belong in the 'head' of a document.
%isHeadElement = map +( $_ => 1 ), (
  qw( title base link meta isindex ),
  qw( script style object bgsound ),
);
226 | ||
227 | =item hashset %HTML::Tagset::isList | |
228 | ||
229 | This hashset contains all elements that can contain "li" elements. | |
230 | ||
231 | =cut | |
232 | ||
# Hashset of the elements that can hold "li" children.
%isList = map +( $_ => 1 ), qw( ol ul menu dir );
234 | ||
235 | =item hashset %HTML::Tagset::isTableElement | |
236 | ||
237 | This hashset contains all elements that are to be found only in/under | |
238 | a "table" element. | |
239 | ||
240 | =cut | |
241 | ||
# Hashset of the elements found only in/under a "table" element.
%isTableElement = map +( $_ => 1 ), (
  qw( caption col colgroup ),
  qw( thead tbody tfoot ),
  qw( tr th td ),
);
244 | ||
245 | =item hashset %HTML::Tagset::isFormElement | |
246 | ||
247 | This hashset contains all elements that are to be found only in/under | |
248 | a "form" element. | |
249 | ||
250 | =cut | |
251 | ||
# Hashset of the elements found only in/under a "form" element.
%isFormElement = map +( $_ => 1 ), (
  qw( input textarea button label ),
  qw( select optgroup option ),
);
254 | ||
255 | =item hashset %HTML::Tagset::isBodyElement | |
256 | ||
257 | This hashset contains all elements that are to be found only in/under | |
258 | the "body" element of an HTML document. | |
259 | ||
260 | =cut | |
261 | ||
# Hashset of the elements found only in/under the "body" element:
# the literal list below plus every key of %isFormElement,
# %isPhraseMarkup and %isTableElement (all three must already be
# populated when this statement runs).
%isBodyElement = map +( $_ => 1 ), (
  qw( h1 h2 h3 h4 h5 h6 ),
  qw( p div pre plaintext address blockquote xmp listing center ),
  qw( multicol iframe ilayer nolayer bgsound ),
  qw( hr ol ul dir menu li dl dt dd ins del ),
  qw( fieldset legend form ),
  qw( map area applet param object ),
  qw( isindex script noscript table ),

  keys(%isFormElement),
  keys(%isPhraseMarkup),    # And everything phrasal
  keys(%isTableElement),
);
290 | ||
291 | ||
292 | =item hashset %HTML::Tagset::isHeadOrBodyElement | |
293 | ||
294 | This hashset includes all elements that I notice can fall either in | |
295 | the head or in the body. | |
296 | ||
297 | =cut | |
298 | ||
# Hashset of the elements that may turn up in either the head or the
# body -- i.e., if we find 'script' in the 'body' or the 'head', don't
# freak out.
%isHeadOrBodyElement = map +( $_ => 1 ), (
  qw( script noscript style isindex ),
  qw( object param map area bgsound ),
);
302 | ||
303 | ||
304 | =item hashset %HTML::Tagset::isKnown | |
305 | ||
306 | This hashset lists all known HTML elements. | |
307 | ||
308 | =cut | |
309 | ||
# Hashset of every known element: everything in %isHeadElement and
# %isBodyElement (both must already be populated here), the document
# skeleton and frame elements, and the "~"-prefixed pseudo-elements.
#
# "~declaration" is listed because %emptyElement names it as a
# pseudo-element; previously it was absent here, so it did not count as
# known.  "~directive" is kept so that existing lookups of that key still
# succeed.
%isKnown = (
  %isHeadElement,
  %isBodyElement,
  map {; $_ => 1 } qw(
    head body html
    frame frameset noframes
    ~comment ~pi ~declaration ~directive ~literal
  ),
);
 # that should be all known tags ever ever
317 | ||
318 | ||
319 | =item hashset %HTML::Tagset::canTighten | |
320 | ||
321 | This hashset lists elements that might have ignorable whitespace as | |
322 | children or siblings. | |
323 | ||
324 | =cut | |
325 | ||
# Hashset of elements that might have ignorable whitespace as children
# or siblings.  Start from every known element ...
%canTighten = %isKnown;

# ... then take out all phrasal markup and two form controls, plus
# xmp, listing, plaintext, and pre, which are untightenable, and
# in a really special way.
for my $tag ( keys(%isPhraseMarkup),
              qw( input select ),
              qw( xmp listing plaintext pre ) ) {
  delete $canTighten{$tag};
}

# Exceptional 'phrasal' things that ARE subject to tightening.
for my $tag ( qw( hr br ) ) {
  $canTighten{$tag} = 1;
}

# The one case where I can think of my tightening rules failing is:
#  <p>foo bar<center> <em>baz quux</em> ...
#                    ^-- that would get deleted.
# But that's pretty gruesome code anyhow.  You gets what you pays for.
340 | ||
341 | #========================================================================== | |
342 | ||
343 | =item array @HTML::Tagset::p_closure_barriers | |
344 | ||
345 | This array has a meaning that I have only seen a need for in | |
346 | C<HTML::TreeBuilder>, but I include it here on the off chance that someone | |
347 | might find it of use: | |
348 | ||
349 | When we see a "E<lt>pE<gt>" token, we go looking up the lineage for a p | |
350 | element we might have to minimize. At first sight, we might say that | |
351 | if there's a p anywhere in the lineage of this new p, it should be | |
352 | closed. But that's wrong. Consider this document: | |
353 | ||
354 | <html> | |
355 | <head> | |
356 | <title>foo</title> | |
357 | </head> | |
358 | <body> | |
359 | <p>foo | |
360 | <table> | |
361 | <tr> | |
362 | <td> | |
363 | foo | |
364 | <p>bar | |
365 | </td> | |
366 | </tr> | |
367 | </table> | |
368 | </p> | |
369 | </body> | |
370 | </html> | |
371 | ||
372 | The second p is quite legally inside a much higher p. | |
373 | ||
374 | My formalization of the reason why this is legal, but this: | |
375 | ||
376 | <p>foo<p>bar</p></p> | |
377 | ||
378 | isn't, is that something about the table constitutes a "barrier" to | |
379 | the application of the rule about what p must minimize. | |
380 | ||
381 | So C<@HTML::Tagset::p_closure_barriers> is the list of all such | |
382 | barrier-tags. | |
383 | ||
384 | =cut | |
385 | ||
# Tags that act as a "barrier" when looking up the lineage for an open
# p element to close.  Element order is kept exactly as it has always
# been.
@p_closure_barriers = (
  qw( li blockquote ),
  qw( ul ol menu dir ),
  qw( dl dt dd ),
  qw( td th tr table caption ),
);

# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
# monkey business of barriers to minimization!
395 | ||
396 | ########################################################################### | |
397 | ||
398 | =item hashset %HTML::Tagset::isCDATA_Parent | |
399 | ||
400 | This hashset includes all elements whose content is CDATA. | |
401 | ||
402 | =cut | |
403 | ||
# Hashset of the elements whose content is CDATA.
# TODO: there's nothing else that takes CDATA children, right?
%isCDATA_Parent = map +( $_ => 1 ), (
  qw( script style ),
  qw( xmp listing plaintext ),
);

# As the HTML3 DTD (Raggett 1995-04-24) noted:
#   The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
#   and derive from very early versions of HTML. They require non-
#   standard parsers and will cause problems for processing
#   documents with standard SGML tools.
414 | ||
415 | ||
416 | ||
417 | ########################################################################### | |
418 | ||
419 | =back | |
420 | ||
421 | =head1 CAVEATS | |
422 | ||
423 | You may find it useful to alter the behavior of modules (like | |
424 | C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s | |
425 | data tables by altering the data tables themselves. You are welcome | |
426 | to try, but be careful; and be aware that different modules may | |
427 | react differently to the data tables being changed. | |
428 | ||
429 | Note that it may be inappropriate to use these tables for I<producing> | |
430 | HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames | |
431 | for all elements that can appear either in the head or in the body, | |
432 | such as "script". That doesn't mean that I am saying your code that | |
433 | produces HTML should feel free to put script elements in either place! | |
434 | If you are producing programs that spit out HTML, you should be | |
435 | I<intimately> familiar with the DTDs for HTML or XHTML (available at | |
436 | C<http://www.w3.org/>), and you should slavishly obey them, not | |
437 | the data tables in this document. | |
438 | ||
439 | =head1 SEE ALSO | |
440 | ||
441 | L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor> | |
442 | ||
443 | =head1 COPYRIGHT | |
444 | ||
445 | Copyright 1995-2000 Gisle Aas; copyright 2000 Sean M. Burke. | |
446 | ||
447 | This library is free software; you can redistribute it and/or | |
448 | modify it under the same terms as Perl itself. | |
449 | ||
450 | =head1 AUTHOR | |
451 | ||
452 | Current maintainer: Sean M. Burke, E<lt>sburke@cpan.orgE<gt> | |
453 | ||
454 | Most of the code/data in this module was adapted from code written by | |
455 | Gisle Aas E<lt>gisle@aas.noE<gt> for C<HTML::Element>, | |
456 | C<HTML::TreeBuilder>, and C<HTML::LinkExtor>. | |
457 | ||
458 | =cut | |
459 | ||
460 | 1; |