Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / HTML / Tagset.pm
CommitLineData
86530b38
AT
1
2require 5;
3package HTML::Tagset; # Time-stamp: "2000-10-20 19:35:06 MDT"
4use strict;
5use vars qw(
6 $VERSION
7 %emptyElement %optionalEndTag %linkElements %boolean_attr
8 %isHeadElement %isBodyElement %isPhraseMarkup
9 %is_Possible_Strict_P_Content
10 %isHeadOrBodyElement
11 %isList %isTableElement %isFormElement
12 %isKnown %canTighten
13 @p_closure_barriers
14 %isCDATA_Parent
15);
16
17$VERSION = '3.03';
18
19=head1 NAME
20
21HTML::Tagset - data tables useful in parsing HTML
22
23=head1 SYNOPSIS
24
25 use HTML::Tagset;
26 # Then use any of the items in the HTML::Tagset package
27 # as need arises
28
29=head1 DESCRIPTION
30
31This module contains several data tables useful in various kinds of
32HTML parsing operations.
33
34Note that all tag names used are lowercase.
35
36In the following documentation, a "hashset" is a hash being used as a
37set -- the hash conveys that its keys are there, and the actual values
38associated with the keys are not significant. (But what values are
39there, are always true.)
40
41=over
42
43=item hashset %HTML::Tagset::emptyElement
44
This hashset has as keys the tag-names (GIs) of elements that cannot
46have content. (For example, "base", "br", "hr".) So
47C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
48C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
49
50=cut
51
52#==========================================================================
53
# Hashset of elements that never take content.  Names that begin with "~"
# are not real tags: they are the pseudo-elements used by HTML::Entities
# and TreeBuilder.
%emptyElement = map { ($_ => 1) } (
  qw(base link meta isindex),
  qw(img br hr wbr),
  qw(input area param),
  qw(embed bgsound spacer),
  qw(basefont col frame),
  qw(~comment ~literal ~declaration ~pi),
);
64
65#---------------------------------------------------------------------------
66
67=item hashset %HTML::Tagset::optionalEndTag
68
This hashset lists tag-names for elements that can have content, but whose
end-tags are generally, "safely", omissible.  Example:
C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true.
72
73=cut
74
# Hashset of elements whose end-tag may be omitted.  The names
# "option th tr td" were commented out of the original list and so
# remain excluded here.
%optionalEndTag = map +( $_ => 1 ), qw(p li dt dd);
76
77#---------------------------------------------------------------------------
78
79=item hash %HTML::Tagset::linkElements
80
Keys in this hash are tagnames for elements that might contain
82links, and the value for each is a reference to an array of the names
83of attributes whose values can be links.
84
85
86=cut
87
# Maps an element name to a reference to the array of its attributes whose
# values can be links.  The order of the attribute names inside each array
# is significant to anything iterating over it, so it is kept as given.
%linkElements = (
  'a'          => [qw(href)],
  'applet'     => [qw(archive codebase code)],
  'area'       => [qw(href)],
  'base'       => [qw(href)],
  'bgsound'    => [qw(src)],
  'blockquote' => [qw(cite)],
  'body'       => [qw(background)],
  'del'        => [qw(cite)],
  'embed'      => [qw(pluginspage src)],
  'form'       => [qw(action)],
  'frame'      => [qw(src longdesc)],
  'head'       => [qw(profile)],
  'iframe'     => [qw(src longdesc)],
  'ilayer'     => [qw(background)],
  'img'        => [qw(src lowsrc longdesc usemap)],
  'input'      => [qw(src usemap)],
  'ins'        => [qw(cite)],
  'isindex'    => [qw(action)],
  'layer'      => [qw(background src)],
  'link'       => [qw(href)],
  'object'     => [qw(classid codebase data archive usemap)],
  'q'          => [qw(cite)],
  'script'     => [qw(src for)],
  'table'      => [qw(background)],
  'td'         => [qw(background)],
  'th'         => [qw(background)],
  'tr'         => [qw(background)],
  'xmp'        => [qw(href)],
);
119
120#---------------------------------------------------------------------------
121
122=item hash %HTML::Tagset::boolean_attr
123
124This hash (not hashset) lists what attributes of what elements can be
125printed without showing the value (for example, the "noshade" attribute
126of "hr" elements). For elements with only one such attribute, its value
127is simply that attribute name. For elements with many such attributes,
128the value is a reference to a hashset containing all such attributes.
129
130=cut
131
# Attributes that may be printed without a value, keyed by element name.
# An element with one such attribute maps to that attribute's name (a plain
# string); an element with several maps to a reference to a hashset of them.
# TODO: make these all hashes
%boolean_attr = (
  # elements that share the same single attribute
  (map +( $_ => 'compact' ), qw(dir dl menu ol ul)),
  (map +( $_ => 'nowrap'  ), qw(td th)),

  # elements with a single attribute of their own
  'area'   => 'nohref',
  'hr'     => 'noshade',
  'img'    => 'ismap',
  'option' => 'selected',
  'select' => 'multiple',

  # the one element with several
  'input'  => { map +( $_ => 1 ), qw(checked readonly disabled) },
);
148
149#==========================================================================
150# List of all elements from Extensible HTML version 1.0 Transitional DTD:
151#
152# a abbr acronym address applet area b base basefont bdo big
153# blockquote body br button caption center cite code col colgroup
154# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
155# head hr html i iframe img input ins isindex kbd label legend li
156# link map menu meta noframes noscript object ol optgroup option p
157# param pre q s samp script select small span strike strong style
158# sub sup table tbody td textarea tfoot th thead title tr tt u ul
159# var
160#
161# Varia from Mozilla source internal table of tags:
162# Implemented:
163# xmp listing wbr nobr frame frameset noframes ilayer
164# layer nolayer spacer embed multicol
165# But these are unimplemented:
166# sound?? keygen?? server??
167# Also seen here and there:
168# marquee?? app?? (both unimplemented)
169#==========================================================================
170
171=item hashset %HTML::Tagset::isPhraseMarkup
172
173This hashset contains all phrasal-level elements.
174
175=cut
176
# Hashset of phrasal-level element names.
# (The original note on this table reads "had: center, hr, table".)
%isPhraseMarkup = ();
foreach (
  qw(span abbr acronym q sub sup),
  qw(cite code em kbd samp strong var dfn strike),
  qw(b i u s tt small big),
  qw(a img br),
  qw(wbr nobr blink),
  qw(font basefont bdo),
  qw(spacer embed noembed),
) {
  $isPhraseMarkup{$_} = 1;
}
186
187
188=item hashset %HTML::Tagset::is_Possible_Strict_P_Content
189
This hashset contains all phrasal-level elements that can be content of a
P element, for a strict model of HTML.
192
193=cut
194
# Hashset of the names that may be content of a P element under a strict
# model of HTML: every phrasal element, the form elements, and three extra
# names (object, script, map) taken from the HTML 4.01 DTD.
#
# The form element names are written out literally.  %isFormElement is only
# populated further down this file, so at this point it is still empty and
# interpolating it alone would contribute no keys at all.  Keep the literal
# list below in step with %isFormElement.
%is_Possible_Strict_P_Content = (
  %isPhraseMarkup,
  %isFormElement,   # empty at this point; kept in case load order changes
  map {; $_ => 1 } (
    qw( input select option optgroup textarea button label ),
    qw( object script map ),
     # I've no idea why there's these latter exceptions.
     # I'm just following the HTML4.01 DTD.
  )
);
202
203#from html4 strict:
204#<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
205#
206#<!ENTITY % phrase "EM | STRONG | DFN | CODE |
207# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
208#
209#<!ENTITY % special
210# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
211#
212#<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
213#
214#<!-- %inline; covers inline or "text-level" elements -->
215#<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
216
217=item hashset %HTML::Tagset::isHeadElement
218
This hashset contains all elements that should be
present only in the 'head' element of an HTML document.
221
222=cut
223
# Hashset of the element names treated as belonging in the 'head' element.
%isHeadElement = ();
foreach (qw(title base link meta isindex), qw(script style object bgsound)) {
  $isHeadElement{$_} = 1;
}
226
227=item hashset %HTML::Tagset::isList
228
229This hashset contains all elements that can contain "li" elements.
230
231=cut
232
# Hashset of the elements that can contain "li" elements.
%isList = (
  'ul'   => 1,
  'ol'   => 1,
  'dir'  => 1,
  'menu' => 1,
);
234
235=item hashset %HTML::Tagset::isTableElement
236
237This hashset contains all elements that are to be found only in/under
238a "table" element.
239
240=cut
241
# Hashset of the elements found only in/under a "table" element.
%isTableElement = map +( $_ => 1 ), (
  qw(tr td th),
  qw(thead tbody tfoot),
  qw(caption col colgroup),
);
244
245=item hashset %HTML::Tagset::isFormElement
246
247This hashset contains all elements that are to be found only in/under
248a "form" element.
249
250=cut
251
# Hashset of the elements found only in/under a "form" element.
%isFormElement = ();
foreach (qw(input select option optgroup), qw(textarea button label)) {
  $isFormElement{$_} = 1;
}
254
=item hashset %HTML::Tagset::isBodyElement
256
257This hashset contains all elements that are to be found only in/under
258the "body" element of an HTML document.
259
260=cut
261
# Hashset of the elements found only in/under the "body" element: the names
# listed here plus every key of %isFormElement, %isPhraseMarkup and
# %isTableElement, all three of which are populated earlier in this file.
# ("center" appeared twice in the original list; a hashset needs it once.)
%isBodyElement = map { ($_ => 1) } (
  qw(h1 h2 h3 h4 h5 h6),
  qw(p div pre plaintext address blockquote),
  qw(xmp listing),
  qw(center multicol),
  qw(iframe ilayer nolayer),
  qw(bgsound hr),
  qw(ol ul dir menu li),
  qw(dl dt dd),
  qw(ins del),
  qw(fieldset legend),
  qw(map area),
  qw(applet param object),
  qw(isindex script noscript),
  qw(table form),

  keys(%isFormElement),
  keys(%isPhraseMarkup),   # And everything phrasal
  keys(%isTableElement),
);
290
291
292=item hashset %HTML::Tagset::isHeadOrBodyElement
293
294This hashset includes all elements that I notice can fall either in
295the head or in the body.
296
297=cut
298
# Hashset of the elements that may turn up in either the head or the body;
# i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
%isHeadOrBodyElement = map { ($_ => 1) } (
  qw(script isindex style),
  qw(object map area param),
  qw(noscript bgsound),
);
302
303
304=item hashset %HTML::Tagset::isKnown
305
306This hashset lists all known HTML elements.
307
308=cut
309
# Hashset of every known element name: all head elements, all body elements,
# the document-structure and frame elements, and the "~"-initial
# pseudo-element names.
#
# "~declaration" is the pseudo-element name that %emptyElement lists, so it
# is included here to keep the two tables in agreement.  "~directive" is
# retained so that any existing lookup of that key keeps working.
%isKnown = (
  %isHeadElement,
  %isBodyElement,
  map {; $_ => 1 } (
    qw( head body html ),
    qw( frame frameset noframes ),
    qw( ~comment ~pi ~directive ~declaration ~literal ),
  )
);
 # that should be all known tags ever ever
317
318
319=item hashset %HTML::Tagset::canTighten
320
321This hashset lists elements that might have ignorable whitespace as
322children or siblings.
323
324=cut
325
# Hashset of elements that might have ignorable whitespace as children or
# siblings.  Start from every known element, then remove the exceptions.
%canTighten = %isKnown;
foreach (
  keys(%isPhraseMarkup),
  qw(input select),
  # xmp, listing, plaintext, and pre are untightenable, and
  # in a really special way.
  qw(xmp listing plaintext pre),
) {
  delete $canTighten{$_};
}
# exceptional 'phrasal' things that ARE subject to tightening.
$canTighten{'hr'} = 1;
$canTighten{'br'} = 1;
335
336# The one case where I can think of my tightening rules failing is:
337# <p>foo bar<center> <em>baz quux</em> ...
338# ^-- that would get deleted.
339# But that's pretty gruesome code anyhow. You gets what you pays for.
340
341#==========================================================================
342
343=item array @HTML::Tagset::p_closure_barriers
344
345This array has a meaning that I have only seen a need for in
346C<HTML::TreeBuilder>, but I include it here on the off chance that someone
347might find it of use:
348
When we see a "E<lt>pE<gt>" token, we go looking up the lineage for a p
350element we might have to minimize. At first sight, we might say that
351if there's a p anywhere in the lineage of this new p, it should be
352closed. But that's wrong. Consider this document:
353
354 <html>
355 <head>
356 <title>foo</title>
357 </head>
358 <body>
359 <p>foo
360 <table>
361 <tr>
362 <td>
363 foo
364 <p>bar
365 </td>
366 </tr>
367 </table>
368 </p>
369 </body>
370 </html>
371
372The second p is quite legally inside a much higher p.
373
374My formalization of the reason why this is legal, but this:
375
376 <p>foo<p>bar</p></p>
377
378isn't, is that something about the table constitutes a "barrier" to
379the application of the rule about what p must minimize.
380
381So C<@HTML::Tagset::p_closure_barriers> is the list of all such
382barrier-tags.
383
384=cut
385
# Tag names that act as a "barrier" when looking up the lineage for an open
# p element to close.  The order of the names is kept as originally given.
@p_closure_barriers = (
  qw(li blockquote),
  qw(ul ol menu dir),
  qw(dl dt dd),
  qw(td th tr table caption),
);
392
393# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
394# monkey business of barriers to minimization!
395
396###########################################################################
397
=item hashset %HTML::Tagset::isCDATA_Parent
399
400This hashset includes all elements whose content is CDATA.
401
402=cut
403
# Hashset of the elements whose content is CDATA.
%isCDATA_Parent = (
  'script'    => 1,
  'style'     => 1,
  'xmp'       => 1,
  'listing'   => 1,
  'plaintext' => 1,
);
406
407# TODO: there's nothing else that takes CDATA children, right?
408
409# As the HTML3 DTD (Raggett 1995-04-24) noted:
410# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
411# and derive from very early versions of HTML. They require non-
412# standard parsers and will cause problems for processing
413# documents with standard SGML tools.
414
415
416
417###########################################################################
418
419=back
420
421=head1 CAVEATS
422
423You may find it useful to alter the behavior of modules (like
424C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
425data tables by altering the data tables themselves. You are welcome
to try, but be careful; and be aware that different modules may or may
not react differently to the data tables being changed.
428
429Note that it may be inappropriate to use these tables for I<producing>
430HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
431for all elements that can appear either in the head or in the body,
432such as "script". That doesn't mean that I am saying your code that
433produces HTML should feel free to put script elements in either place!
434If you are producing programs that spit out HTML, you should be
435I<intimately> familiar with the DTDs for HTML or XHTML (available at
436C<http://www.w3.org/>), and you should slavishly obey them, not
437the data tables in this document.
438
439=head1 SEE ALSO
440
441L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
442
443=head1 COPYRIGHT
444
445Copyright 1995-2000 Gisle Aas; copyright 2000 Sean M. Burke.
446
447This library is free software; you can redistribute it and/or
448modify it under the same terms as Perl itself.
449
450=head1 AUTHOR
451
452Current maintainer: Sean M. Burke, E<lt>sburke@cpan.orgE<gt>
453
454Most of the code/data in this module was adapted from code written by
455Gisle Aas E<lt>gisle@aas.noE<gt> for C<HTML::Element>,
456C<HTML::TreeBuilder>, and C<HTML::LinkExtor>.
457
458=cut
459
4601;