| 1 | |
| 2 | require 5; |
| 3 | package HTML::Tagset; # Time-stamp: "2000-10-20 19:35:06 MDT" |
| 4 | use strict; |
| 5 | use vars qw( |
| 6 | $VERSION |
| 7 | %emptyElement %optionalEndTag %linkElements %boolean_attr |
| 8 | %isHeadElement %isBodyElement %isPhraseMarkup |
| 9 | %is_Possible_Strict_P_Content |
| 10 | %isHeadOrBodyElement |
| 11 | %isList %isTableElement %isFormElement |
| 12 | %isKnown %canTighten |
| 13 | @p_closure_barriers |
| 14 | %isCDATA_Parent |
| 15 | ); |
| 16 | |
| 17 | $VERSION = '3.03'; |
| 18 | |
| 19 | =head1 NAME |
| 20 | |
| 21 | HTML::Tagset - data tables useful in parsing HTML |
| 22 | |
| 23 | =head1 SYNOPSIS |
| 24 | |
| 25 | use HTML::Tagset; |
| 26 | # Then use any of the items in the HTML::Tagset package |
| 27 | # as need arises |
| 28 | |
| 29 | =head1 DESCRIPTION |
| 30 | |
| 31 | This module contains several data tables useful in various kinds of |
| 32 | HTML parsing operations. |
| 33 | |
| 34 | Note that all tag names used are lowercase. |
| 35 | |
| 36 | In the following documentation, a "hashset" is a hash being used as a |
| 37 | set -- the hash conveys that its keys are there, and the actual values |
| 38 | associated with the keys are not significant. (But what values are |
| 39 | there, are always true.) |
| 40 | |
| 41 | =over |
| 42 | |
| 43 | =item hashset %HTML::Tagset::emptyElement |
| 44 | |
| 45 | This hashset has as keys the tag-names (GIs) of elements that cannot |
| 46 | have content. (For example, "base", "br", "hr".) So |
| 47 | C<$HTML::Tagset::emptyElement{'hr'}> exists and is true. |
| 48 | C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true. |
| 49 | |
| 50 | =cut |
| 51 | |
| 52 | #========================================================================== |
| 53 | |
# Set of element names (GIs) that can never have content.  Each name
# listed becomes a key of the hash with the value 1.
%emptyElement = map +( $_ => 1 ), qw(
    base      link      meta     isindex
    img       br        hr       wbr
    input     area      param
    embed     bgsound   spacer
    basefont  col       frame
    ~comment  ~literal  ~declaration  ~pi
);
# Names starting with "~" are not real HTML tags: they are the
# pseudo-element names used by HTML::Element and HTML::TreeBuilder.
| 64 | |
| 65 | #--------------------------------------------------------------------------- |
| 66 | |
| 67 | =item hashset %HTML::Tagset::optionalEndTag |
| 68 | |
| 69 | This hashset lists tag-names for elements that can have content, but whose |
| 70 | end-tags are generally, "safely", omissible. Example: |
| 71 | C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true. |
| 72 | |
| 73 | =cut |
| 74 | |
# Elements that can have content but whose end-tag may safely be omitted.
# Candidates currently left out of the set: option th tr td
%optionalEndTag = ();
for (qw(p li dt dd)) {
    $optionalEndTag{$_} = 1;
}
| 76 | |
| 77 | #--------------------------------------------------------------------------- |
| 78 | |
| 79 | =item hash %HTML::Tagset::linkElements |
| 80 | |
| 81 | Keys in this hash are tagnames for elements that might contain |
| 82 | links, and the value for each is a reference to an array of the names |
| 83 | of attributes whose values can be links. |
| 84 | |
| 85 | |
| 86 | =cut |
| 87 | |
# Maps an element name to a reference to an array of the attribute names
# on that element whose values can be links.  The order of the attribute
# names inside each array is kept exactly as published.
%linkElements = (
    'a'          => [qw(href)],
    'applet'     => [qw(archive codebase code)],
    'area'       => [qw(href)],
    'base'       => [qw(href)],
    'bgsound'    => [qw(src)],
    'blockquote' => [qw(cite)],
    'body'       => [qw(background)],
    'del'        => [qw(cite)],
    'embed'      => [qw(pluginspage src)],
    'form'       => [qw(action)],
    'frame'      => [qw(src longdesc)],
    'head'       => [qw(profile)],
    'iframe'     => [qw(src longdesc)],
    'ilayer'     => [qw(background)],
    'img'        => [qw(src lowsrc longdesc usemap)],
    'input'      => [qw(src usemap)],
    'ins'        => [qw(cite)],
    'isindex'    => [qw(action)],
    'layer'      => [qw(background src)],
    'link'       => [qw(href)],
    'object'     => [qw(classid codebase data archive usemap)],
    'q'          => [qw(cite)],
    'script'     => [qw(src for)],
    'table'      => [qw(background)],
    'td'         => [qw(background)],
    'th'         => [qw(background)],
    'tr'         => [qw(background)],
    'xmp'        => [qw(href)],
);
| 119 | |
| 120 | #--------------------------------------------------------------------------- |
| 121 | |
| 122 | =item hash %HTML::Tagset::boolean_attr |
| 123 | |
| 124 | This hash (not hashset) lists what attributes of what elements can be |
| 125 | printed without showing the value (for example, the "noshade" attribute |
| 126 | of "hr" elements). For elements with only one such attribute, its value |
| 127 | is simply that attribute name. For elements with many such attributes, |
| 128 | the value is a reference to a hashset containing all such attributes. |
| 129 | |
| 130 | =cut |
| 131 | |
# Attributes that may be printed without a value, keyed by element name.
# An element with a single such attribute maps to that attribute's name
# (a plain string); an element with several maps to a reference to a
# hashset of the attribute names.
# TODO: make these all hashes
%boolean_attr = (
    # several elements share the same lone attribute
    ( map +( $_ => 'compact' ), qw(dir dl menu ol ul) ),
    ( map +( $_ => 'nowrap'  ), qw(td th) ),

    # one-off single attributes
    'area'   => 'nohref',
    'hr'     => 'noshade',
    'img'    => 'ismap',
    'option' => 'selected',
    'select' => 'multiple',

    # the only element with more than one
    'input'  => { map +( $_ => 1 ), qw(checked readonly disabled) },
);
| 148 | |
| 149 | #========================================================================== |
| 150 | # List of all elements from Extensible HTML version 1.0 Transitional DTD: |
| 151 | # |
| 152 | # a abbr acronym address applet area b base basefont bdo big |
| 153 | # blockquote body br button caption center cite code col colgroup |
| 154 | # dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 |
| 155 | # head hr html i iframe img input ins isindex kbd label legend li |
| 156 | # link map menu meta noframes noscript object ol optgroup option p |
| 157 | # param pre q s samp script select small span strike strong style |
| 158 | # sub sup table tbody td textarea tfoot th thead title tr tt u ul |
| 159 | # var |
| 160 | # |
| 161 | # Varia from Mozilla source internal table of tags: |
| 162 | # Implemented: |
| 163 | # xmp listing wbr nobr frame frameset noframes ilayer |
| 164 | # layer nolayer spacer embed multicol |
| 165 | # But these are unimplemented: |
| 166 | # sound?? keygen?? server?? |
| 167 | # Also seen here and there: |
| 168 | # marquee?? app?? (both unimplemented) |
| 169 | #========================================================================== |
| 170 | |
| 171 | =item hashset %HTML::Tagset::isPhraseMarkup |
| 172 | |
| 173 | This hashset contains all phrasal-level elements. |
| 174 | |
| 175 | =cut |
| 176 | |
# Set of phrasal-level (inline) elements.
# center, hr and table used to be listed here but no longer are.
%isPhraseMarkup = map { ($_ => 1) } qw(
    span   abbr  acronym  q    sub   sup
    cite   code  em       kbd  samp  strong  var  dfn  strike
    b      i     u        s    tt    small   big
    a      img   br
    wbr    nobr  blink
    font   basefont       bdo
    spacer embed noembed
);
| 186 | |
| 187 | |
| 188 | =item hashset %HTML::Tagset::is_Possible_Strict_P_Content |
| 189 | |
| 190 | This hashset contains all phrasal-level elements that can be content of a |
| 191 | P element, for a strict model of HTML. |
| 192 | |
| 193 | =cut |
| 194 | |
# Set of elements that may be content of a P element under a strict
# model of HTML: everything phrasal, the form controls, and a few
# "special" inline elements.
#
# The form controls are spelled out here rather than taken from
# %isFormElement: that hash is only assigned further down the file, so
# it is still empty when this statement runs and interpolating it would
# silently contribute nothing.  The names used are the members of the
# HTML 4.01 "%formctrl;" entity; option and optgroup are not part of
# %inline; and so are not added.
%is_Possible_Strict_P_Content = (
    %isPhraseMarkup,
    map {; $_ => 1 } qw(
        input select textarea label button
        object script map
    ),
    # object, script and map are included because the HTML 4.01 DTD
    # lists them in its "%special;" inline entity.
);
| 202 | |
| 203 | #from html4 strict: |
| 204 | #<!ENTITY % fontstyle "TT | I | B | BIG | SMALL"> |
| 205 | # |
| 206 | #<!ENTITY % phrase "EM | STRONG | DFN | CODE | |
| 207 | # SAMP | KBD | VAR | CITE | ABBR | ACRONYM" > |
| 208 | # |
| 209 | #<!ENTITY % special |
| 210 | # "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO"> |
| 211 | # |
| 212 | #<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON"> |
| 213 | # |
| 214 | #<!-- %inline; covers inline or "text-level" elements --> |
| 215 | #<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;"> |
| 216 | |
| 217 | =item hashset %HTML::Tagset::isHeadElement |
| 218 | |
| 219 | This hashset contains all elements that should be |
| 220 | present only in the 'head' element of an HTML document. |
| 221 | |
| 222 | =cut |
| 223 | |
# Set of elements treated as belonging in the 'head' of a document.
%isHeadElement = ();
for (qw(title base link meta isindex script style object bgsound)) {
    $isHeadElement{$_} = 1;
}
| 226 | |
| 227 | =item hashset %HTML::Tagset::isList |
| 228 | |
| 229 | This hashset contains all elements that can contain "li" elements. |
| 230 | |
| 231 | =cut |
| 232 | |
# Set of elements that can contain "li" elements.
%isList = map +( $_ => 1 ), qw( ul ol dir menu );
| 234 | |
| 235 | =item hashset %HTML::Tagset::isTableElement |
| 236 | |
| 237 | This hashset contains all elements that are to be found only in/under |
| 238 | a "table" element. |
| 239 | |
| 240 | =cut |
| 241 | |
# Set of elements that occur only in/under a "table" element.
%isTableElement = map { ($_ => 1) } qw(
    tr  td  th
    thead  tbody  tfoot
    caption  col  colgroup
);
| 244 | |
| 245 | =item hashset %HTML::Tagset::isFormElement |
| 246 | |
| 247 | This hashset contains all elements that are to be found only in/under |
| 248 | a "form" element. |
| 249 | |
| 250 | =cut |
| 251 | |
# Set of elements that occur only in/under a "form" element.
%isFormElement = ();
for (qw(input select option optgroup textarea button label)) {
    $isFormElement{$_} = 1;
}
| 254 | |
| 255 | =item hashset %HTML::Tagset::isBodyElement |
| 256 | |
| 257 | This hashset contains all elements that are to be found only in/under |
| 258 | the "body" element of an HTML document. |
| 259 | |
| 260 | =cut |
| 261 | |
# Set of elements that belong in/under the 'body' of a document: the
# names listed explicitly below plus every key of %isFormElement,
# %isPhraseMarkup and %isTableElement.
%isBodyElement = map +( $_ => 1 ),
    # headings and block-level text containers
    qw( h1 h2 h3 h4 h5 h6 ),
    qw( p div pre plaintext address blockquote ),
    qw( xmp listing center ),
    # assorted vendor extensions
    qw( multicol iframe ilayer nolayer bgsound ),
    # rules, lists, and edit markers
    qw( hr ),
    qw( ol ul dir menu li ),
    qw( dl dt dd ),
    qw( ins del ),
    # form scaffolding, image maps, embedded objects, scripting
    qw( fieldset legend ),
    qw( map area ),
    qw( applet param object ),
    qw( isindex script noscript ),
    qw( table form ),
    # whole families pulled in from the other sets
    keys(%isFormElement),
    keys(%isPhraseMarkup),
    keys(%isTableElement);
| 290 | |
| 291 | |
| 292 | =item hashset %HTML::Tagset::isHeadOrBodyElement |
| 293 | |
| 294 | This hashset includes all elements that I notice can fall either in |
| 295 | the head or in the body. |
| 296 | |
| 297 | =cut |
| 298 | |
# Set of elements that are acceptable in either the head or the body;
# e.g. meeting a 'script' in either place is nothing to be alarmed by.
%isHeadOrBodyElement = map { ($_ => 1) } qw(
    script  isindex  style  object
    map     area     param  noscript
    bgsound
);
| 302 | |
| 303 | |
| 304 | =item hashset %HTML::Tagset::isKnown |
| 305 | |
| 306 | This hashset lists all known HTML elements. |
| 307 | |
| 308 | =cut |
| 309 | |
# Set of every known element: all head elements, all body elements,
# the document skeleton, the frame elements, and the "~" pseudo-elements
# named below.  That should be all known tags ever ever.
%isKnown = (
    %isHeadElement,
    %isBodyElement,
);
for (
    qw( head body html ),
    qw( frame frameset noframes ),
    qw( ~comment ~pi ~directive ~literal ),
) {
    $isKnown{$_} = 1;
}
| 317 | |
| 318 | |
| 319 | =item hashset %HTML::Tagset::canTighten |
| 320 | |
| 321 | This hashset lists elements that might have ignorable whitespace as |
| 322 | children or siblings. |
| 323 | |
| 324 | =cut |
| 325 | |
# Set of elements that might have ignorable whitespace as children or
# siblings.  Begin with every known element, then take out the ones
# that must be left alone.
%canTighten = %isKnown;
for (
    keys(%isPhraseMarkup),
    qw( input select ),
    # xmp, listing, plaintext, and pre are untightenable, and in a
    # really special way.
    qw( xmp listing plaintext pre ),
) {
    delete $canTighten{$_};
}

# hr and br are exceptional 'phrasal' things that ARE subject to
# tightening, so they are (re)instated.
$canTighten{'hr'} = 1;
$canTighten{'br'} = 1;

# The one case I can think of where these tightening rules fail is:
#   <p>foo bar<center> <em>baz quux</em> ...
#                     ^-- that would get deleted.
# But that's pretty gruesome code anyhow.  You gets what you pays for.
| 340 | |
| 341 | #========================================================================== |
| 342 | |
| 343 | =item array @HTML::Tagset::p_closure_barriers |
| 344 | |
| 345 | This array has a meaning that I have only seen a need for in |
| 346 | C<HTML::TreeBuilder>, but I include it here on the off chance that someone |
| 347 | might find it of use: |
| 348 | |
| 349 | When we see a "E<lt>pE<gt>" token, we go looking up the lineage for a p |
| 350 | element we might have to minimize. At first sight, we might say that |
| 351 | if there's a p anywhere in the lineage of this new p, it should be |
| 352 | closed. But that's wrong. Consider this document: |
| 353 | |
| 354 | <html> |
| 355 | <head> |
| 356 | <title>foo</title> |
| 357 | </head> |
| 358 | <body> |
| 359 | <p>foo |
| 360 | <table> |
| 361 | <tr> |
| 362 | <td> |
| 363 | foo |
| 364 | <p>bar |
| 365 | </td> |
| 366 | </tr> |
| 367 | </table> |
| 368 | </p> |
| 369 | </body> |
| 370 | </html> |
| 371 | |
| 372 | The second p is quite legally inside a much higher p. |
| 373 | |
| 374 | My formalization of the reason why this is legal, but this: |
| 375 | |
| 376 | <p>foo<p>bar</p></p> |
| 377 | |
| 378 | isn't, is that something about the table constitutes a "barrier" to |
| 379 | the application of the rule about what p must minimize. |
| 380 | |
| 381 | So C<@HTML::Tagset::p_closure_barriers> is the list of all such |
| 382 | barrier-tags. |
| 383 | |
| 384 | =cut |
| 385 | |
# Elements that act as a "barrier" when looking up the lineage for an
# open p element to close: a p above one of these is left alone.
@p_closure_barriers = (
    'li', 'blockquote',
    'ul', 'ol', 'menu', 'dir',
    'dl', 'dt', 'dd',
    'td', 'th', 'tr', 'table', 'caption',
);
| 392 | |
| 393 | # In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this |
| 394 | # monkey business of barriers to minimization! |
| 395 | |
| 396 | ########################################################################### |
| 397 | |
| 398 | =item hashset %HTML::Tagset::isCDATA_Parent |
| 399 | |
| 400 | This hashset includes all elements whose content is CDATA. |
| 401 | |
| 402 | =cut |
| 403 | |
# Set of elements whose content is CDATA.
%isCDATA_Parent = map +( $_ => 1 ), qw(
    script style
    xmp listing plaintext
);
| 406 | |
| 407 | # TODO: there's nothing else that takes CDATA children, right? |
| 408 | |
| 409 | # As the HTML3 DTD (Raggett 1995-04-24) noted: |
| 410 | # The XMP, LISTING and PLAINTEXT tags are incompatible with SGML |
| 411 | # and derive from very early versions of HTML. They require non- |
| 412 | # standard parsers and will cause problems for processing |
| 413 | # documents with standard SGML tools. |
| 414 | |
| 415 | |
| 416 | |
| 417 | ########################################################################### |
| 418 | |
| 419 | =back |
| 420 | |
| 421 | =head1 CAVEATS |
| 422 | |
| 423 | You may find it useful to alter the behavior of modules (like |
| 424 | C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s |
| 425 | data tables by altering the data tables themselves. You are welcome |
| 426 | to try, but be careful; and be aware that different modules may or may |
| 427 | not react differently to the data tables being changed. |
| 428 | |
| 429 | Note that it may be inappropriate to use these tables for I<producing> |
| 430 | HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames |
| 431 | for all elements that can appear either in the head or in the body, |
| 432 | such as "script". That doesn't mean that I am saying your code that |
| 433 | produces HTML should feel free to put script elements in either place! |
| 434 | If you are producing programs that spit out HTML, you should be |
| 435 | I<intimately> familiar with the DTDs for HTML or XHTML (available at |
| 436 | C<http://www.w3.org/>), and you should slavishly obey them, not |
| 437 | the data tables in this document. |
| 438 | |
| 439 | =head1 SEE ALSO |
| 440 | |
| 441 | L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor> |
| 442 | |
| 443 | =head1 COPYRIGHT |
| 444 | |
| 445 | Copyright 1995-2000 Gisle Aas; copyright 2000 Sean M. Burke. |
| 446 | |
| 447 | This library is free software; you can redistribute it and/or |
| 448 | modify it under the same terms as Perl itself. |
| 449 | |
| 450 | =head1 AUTHOR |
| 451 | |
| 452 | Current maintainer: Sean M. Burke, E<lt>sburke@cpan.orgE<gt> |
| 453 | |
| 454 | Most of the code/data in this module was adapted from code written by |
| 455 | Gisle Aas E<lt>gisle@aas.noE<gt> for C<HTML::Element>, |
| 456 | C<HTML::TreeBuilder>, and C<HTML::LinkExtor>. |
| 457 | |
| 458 | =cut |
| 459 | |
| 460 | 1; |