Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / MHonArc / CharEnt.pm
CommitLineData
86530b38
AT
1##---------------------------------------------------------------------------##
2## File:
3## $Id: CharEnt.pm,v 1.3 2002/04/13 00:58:09 ehood Exp $
4## Author:
5## Earl Hood earl@earlhood.com
6## Description:
7## Module to deal with 8-bit character data conversion to
8## (SGML) entity references.
9##---------------------------------------------------------------------------##
10## Copyright (C) 1997-2002 Earl Hood, earl@earlhood.com
11##
12## This program is free software; you can redistribute it and/or modify
13## it under the terms of the GNU General Public License as published by
14## the Free Software Foundation; either version 2 of the License, or
15## (at your option) any later version.
16##
17## This program is distributed in the hope that it will be useful,
18## but WITHOUT ANY WARRANTY; without even the implied warranty of
19## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20## GNU General Public License for more details.
21##
22## You should have received a copy of the GNU General Public License
23## along with this program; if not, write to the Free Software
24## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
25## 02111-1307, USA
26##---------------------------------------------------------------------------##
27
28package MHonArc::CharEnt;
29
30use strict;
31
32##---------------------------------------------------------------------------
33## US-ASCII/Common characters
34##---------------------------------------------------------------------------
35
# Characters that get an entity reference whatever the charset is: the
# SGML markup delimiters plus the no-break space.  Keys are character
# codes; values are entity names without the leading '&' and trailing ';'.
my %ASCIIMap = (
    # Code    Entity        ISO external entity set : description
    0x22 => 'quot',       # ISOnum : Quotation mark
    0x26 => 'amp',        # ISOnum : Ampersand
    0x3C => 'lt',         # ISOnum : Less-than sign
    0x3E => 'gt',         # ISOnum : Greater-than sign
    0xA0 => 'nbsp',       # ISOnum : NO-BREAK SPACE
);

# Inverse table: entity name => character code.  Every entity name above
# is unique, so no entry is lost when the table is inverted.
my %ASCIIMapReverse = map { ($ASCIIMap{$_} => $_) } keys %ASCIIMap;
49
50##---------------------------------------------------------------------------
51## Loaded Maps
52##---------------------------------------------------------------------------
53
# Caches of the maps loaded so far, keyed by charset name.  Only
# 'us-ascii' is present up front; _load_charmap and
# _reverse_load_charmap add further entries as charsets are requested.

# character code => entity name
my %char2ent_maps;
$char2ent_maps{'us-ascii'} = \%ASCIIMap;

# entity name => character code
my %ent2char_maps;
$ent2char_maps{'us-ascii'} = \%ASCIIMapReverse;
62
63##---------------------------------------------------------------------------
64## Charset specification to mapping
65##---------------------------------------------------------------------------
66
# Charset name => file that yields the character => entity map for that
# charset.  The table is generated from three families of names:
#   iso-8859-N    the ISO-8859 parts that have a map file
#   latinN        aliases for particular ISO-8859 parts
#   windows-NNNN  Windows code pages
my %CharsetMaps = do {
    my $dir = 'MHonArc/CharEnt';

    # latinN alias number => ISO-8859 part number it stands for
    my %latin_part = (
        1 => 1,
        2 => 2,
        3 => 3,
        4 => 4,
        5 => 9,
        6 => 10,
        9 => 15,
    );

    (
        (map {; "iso-8859-$_" => "$dir/ISO8859_$_.pm" } 1 .. 10, 15),
        (map {; "latin$_" => "$dir/ISO8859_$latin_part{$_}.pm" }
             keys %latin_part),
        (map {; "windows-$_" => "$dir/CP$_.pm" } 1250, 1252),
    );
};
89
# Charset name => file that yields a ready-made entity => character map.
# Charsets that are not listed here get their reverse map by inverting
# the forward (character => entity) map instead.
my %ReverseCharsetMaps = do {
    my $dir = 'MHonArc/CharEnt';

    # latinN alias number => ISO-8859 part number it stands for
    my %latin_part = (
        1 => 1,
        3 => 3,
        5 => 9,
        9 => 15,
    );

    (
        (map {; "iso-8859-$_" => "$dir/ISO8859_${_}R.pm" } 1, 3, 7, 8, 9, 15),
        (map {; "latin$_" => "$dir/ISO8859_$latin_part{$_}R.pm" }
             keys %latin_part),
    );
};
102
103###############################################################################
104## Routines
105###############################################################################
106
107##---------------------------------------------------------------------------##
##	str2sgml takes a string encoded in $charset and returns SGML
##	text in which every character that has a known entity is
##	replaced by the matching "&name;" reference.
##
##	    $return_data = MHonArc::CharEnt::str2sgml($data, $charset, $only8bit);
##
##	$charset is lowercased and any '_' in it is read as '-'.
##
##	If $only8bit is true, then every character whose code is below
##	0xA0 is copied through unchanged and only the characters from
##	0xA0 upwards are translated.
##
##	A character is looked up in the charset's own map first and in
##	the common map (quot, amp, lt, gt, nbsp) second; a character
##	found in neither is copied through unchanged.
##
sub str2sgml {
    my ($data, $charset, $only8bit) = @_;

    # Normalise the charset name so that "ISO_8859_1" and "iso-8859-1"
    # select the same map.
    $charset = lc $charset;
    $charset =~ tr/_/-/;

    # Use the cached map when there is one, otherwise load it on demand.
    my $map = $char2ent_maps{$charset};
    if (!defined $map) {
        $map = _load_charmap($charset);
    }

    my $sgml = '';
    for my $ch (split //, $data) {
        my $code = unpack('C', $ch);

        # Pick the entity name, if any, that stands for this character.
        my $entity = ($only8bit && $code < 0xA0) ? undef
                   : $map->{$code}               ? $map->{$code}
                   : $ASCIIMap{$code}            ? $ASCIIMap{$code}
                   :                               undef;

        $sgml .= defined($entity) ? "&$entity;" : pack('C', $code);
    }
    return $sgml;
}
146
147##---------------------------------------------------------------------------##
##	sgml2str replaces the character entity references ("&name;")
##	in a string with the raw characters they denote in $charset.
##
##	    $return_data = MHonArc::CharEnt::sgml2str($data, $charset);
##
##	$charset is lowercased and any '_' in it is read as '-'.
##	An entity name is looked up in the charset's own map first and
##	in the common map (quot, amp, lt, gt, nbsp) second; a reference
##	found in neither is left in the text exactly as written.
##
sub sgml2str {
    my ($data, $charset) = @_;

    $charset = lc $charset;
    $charset =~ tr/_/-/;

    # Use the cached map when there is one, otherwise load it on demand.
    my $map = $ent2char_maps{$charset};
    $map = _reverse_load_charmap($charset) if !defined $map;

    # Resolve one entity name: the charset map wins over the common map.
    my $decode = sub {
        my $name = shift;
        for my $table ($map, \%ASCIIMapReverse) {
            return sprintf("%c", $table->{$name})
                if defined($table->{$name});
        }
        return "&$name;";
    };

    $data =~ s/\&([\w\.\-]+);/$decode->($1)/ge;
    return $data;
}
171
172##---------------------------------------------------------------------------##
173
##	_load_charmap loads the character => entity map for $charset,
##	stores it in %char2ent_maps and returns it.
##
##	    $map = _load_charmap($charset);
##
##	$charset must already be normalised (lowercase, '-' rather than
##	'_').  The return value is always a hash reference.  For an
##	unknown charset, a map file that fails to load, or a map file
##	that does not yield a hash reference, a warning is printed and
##	an empty map is cached and returned, so callers simply leave
##	the charset's characters untranslated.
##
sub _load_charmap {
    my $charset = shift;
    my $file    = $CharsetMaps{$charset};
    my $map;

    if (!defined($file)) {
        warn 'Warning: MHonArc::CharEnt: Unknown charset: ', $charset, "\n";

    } else {
        # Forget any earlier load of this file: require returns a plain
        # 1 for a file it has already loaded, but the file's own return
        # value (the map) is what is needed here.
        delete $INC{$file};

        my $loaded;
        my $ok = eval { $loaded = require $file; 1 };
        if (!$ok) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";

        } elsif (UNIVERSAL::isa($loaded, 'HASH')) {
            $map = $loaded;

        } else {
            # A file ending in "1;" would otherwise be cached as the map
            # and make every later lookup die under strict refs.
            warn 'Warning: MHonArc::CharEnt: ', $file,
                 " did not return a hash reference\n";
        }
    }

    $map = { } unless defined $map;
    $char2ent_maps{$charset} = $map;
    return $map;
}
195
##	_reverse_load_charmap loads the entity => character map for
##	$charset, stores it in %ent2char_maps and returns it.
##
##	    $map = _reverse_load_charmap($charset);
##
##	$charset must already be normalised (lowercase, '-' rather than
##	'_').  When the charset has a dedicated reverse map file, that
##	file is loaded; otherwise the forward (character => entity) map
##	is inverted.  The return value is always a hash reference: if
##	the reverse map file fails to load or does not yield a hash
##	reference, a warning is printed and an empty map is used.
##
sub _reverse_load_charmap {
    my $charset = shift;
    my $file    = $ReverseCharsetMaps{$charset};
    my $map;

    if (!defined($file)) {
        # No dedicated reverse file: invert the forward map, loading it
        # first if it has not been cached yet.
        my $forward = $char2ent_maps{$charset};
        $forward = _load_charmap($charset) unless defined $forward;

        # Never dereference something that is not a hash; that would
        # die under strict refs.
        $forward = { } unless UNIVERSAL::isa($forward, 'HASH');
        $map = { reverse %$forward };

    } else {
        # Forget any earlier load of this file: require returns a plain
        # 1 for a file it has already loaded, but the file's own return
        # value (the map) is what is needed here.
        delete $INC{$file};

        my $loaded;
        my $ok = eval { $loaded = require $file; 1 };
        if (!$ok) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";

        } elsif (UNIVERSAL::isa($loaded, 'HASH')) {
            $map = $loaded;

        } else {
            warn 'Warning: MHonArc::CharEnt: ', $file,
                 " did not return a hash reference\n";
        }
        $map = { } unless defined $map;
    }

    $ent2char_maps{$charset} = $map;
    return $map;
}
219
220##---------------------------------------------------------------------------##
2211;
222