Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | ##---------------------------------------------------------------------------## |
2 | ## File: | |
3 | ## $Id: CharEnt.pm,v 1.3 2002/04/13 00:58:09 ehood Exp $ | |
4 | ## Author: | |
5 | ## Earl Hood earl@earlhood.com | |
6 | ## Description: | |
7 | ## Module to deal with 8-bit character data conversion to | |
8 | ## (SGML) entity references. | |
9 | ##---------------------------------------------------------------------------## | |
10 | ## Copyright (C) 1997-2002 Earl Hood, earl@earlhood.com | |
11 | ## | |
12 | ## This program is free software; you can redistribute it and/or modify | |
13 | ## it under the terms of the GNU General Public License as published by | |
14 | ## the Free Software Foundation; either version 2 of the License, or | |
15 | ## (at your option) any later version. | |
16 | ## | |
17 | ## This program is distributed in the hope that it will be useful, | |
18 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ## GNU General Public License for more details. | |
21 | ## | |
22 | ## You should have received a copy of the GNU General Public License | |
23 | ## along with this program; if not, write to the Free Software | |
24 | ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA | |
25 | ## 02111-1307, USA | |
26 | ##---------------------------------------------------------------------------## | |
27 | ||
28 | package MHonArc::CharEnt; | |
29 | ||
30 | use strict; | |
31 | ||
32 | ##--------------------------------------------------------------------------- | |
33 | ## US-ASCII/Common characters | |
34 | ##--------------------------------------------------------------------------- | |
35 | ||
# Fallback character-code => entity-name table.  str2sgml() consults it
# when the charset-specific map has no entry for a character.  Entity
# names are stored without the surrounding "&" and ";".
my %ASCIIMap = (
    34  => "quot",    # 0x22  ISOnum : Quotation mark
    38  => "amp",     # 0x26  ISOnum : Ampersand
    60  => "lt",      # 0x3C  ISOnum : Less-than sign
    62  => "gt",      # 0x3E  ISOnum : Greater-than sign
    160 => "nbsp",    # 0xA0  ISOnum : NO-BREAK SPACE
);

# Inverse table (entity name => character code) used by sgml2str().
# keys() and values() walk an unmodified hash in the same order, so the
# slice assignment pairs every entity name with its own code.
my %ASCIIMapReverse;
@ASCIIMapReverse{ values %ASCIIMap } = keys %ASCIIMap;
49 | ||
50 | ##--------------------------------------------------------------------------- | |
51 | ## Loaded Maps | |
52 | ##--------------------------------------------------------------------------- | |
53 | ||
# Caches of mapping tables, keyed by charset name; each value is a hash
# reference.
#   %char2ent_maps : character code => entity name
#   %ent2char_maps : entity name    => character code
# Both start out holding only the built-in us-ascii tables; the loader
# routines store further charsets here as they are requested.
my (%char2ent_maps, %ent2char_maps);
$char2ent_maps{'us-ascii'} = \%ASCIIMap;
$ent2char_maps{'us-ascii'} = \%ASCIIMapReverse;
62 | ||
63 | ##--------------------------------------------------------------------------- | |
64 | ## Charset specification to mapping | |
65 | ##--------------------------------------------------------------------------- | |
66 | ||
# Charset name => file (as given to require) providing the
# character-code => entity-name table for that charset.  Each "latinN"
# alias is listed beside the ISO-8859 part that shares its file.
my %CharsetMaps = (
    # ISO-8859 parts that also have a "latinN" alias
    'iso-8859-1'   => 'MHonArc/CharEnt/ISO8859_1.pm',
    'latin1'       => 'MHonArc/CharEnt/ISO8859_1.pm',
    'iso-8859-2'   => 'MHonArc/CharEnt/ISO8859_2.pm',
    'latin2'       => 'MHonArc/CharEnt/ISO8859_2.pm',
    'iso-8859-3'   => 'MHonArc/CharEnt/ISO8859_3.pm',
    'latin3'       => 'MHonArc/CharEnt/ISO8859_3.pm',
    'iso-8859-4'   => 'MHonArc/CharEnt/ISO8859_4.pm',
    'latin4'       => 'MHonArc/CharEnt/ISO8859_4.pm',
    'iso-8859-9'   => 'MHonArc/CharEnt/ISO8859_9.pm',
    'latin5'       => 'MHonArc/CharEnt/ISO8859_9.pm',
    'iso-8859-10'  => 'MHonArc/CharEnt/ISO8859_10.pm',
    'latin6'       => 'MHonArc/CharEnt/ISO8859_10.pm',
    'iso-8859-15'  => 'MHonArc/CharEnt/ISO8859_15.pm',
    'latin9'       => 'MHonArc/CharEnt/ISO8859_15.pm',

    # ISO-8859 parts without an alias
    'iso-8859-5'   => 'MHonArc/CharEnt/ISO8859_5.pm',
    'iso-8859-6'   => 'MHonArc/CharEnt/ISO8859_6.pm',
    'iso-8859-7'   => 'MHonArc/CharEnt/ISO8859_7.pm',
    'iso-8859-8'   => 'MHonArc/CharEnt/ISO8859_8.pm',

    # Windows code pages
    'windows-1250' => 'MHonArc/CharEnt/CP1250.pm',
    'windows-1252' => 'MHonArc/CharEnt/CP1252.pm',
);
89 | ||
# Charset name => file (as given to require) providing a ready-made
# entity-name => character-code table.  Charsets not listed here get
# their reverse table derived from the forward table by
# _reverse_load_charmap().
my %ReverseCharsetMaps = (
    # ISO-8859 parts that also have a "latinN" alias
    'iso-8859-1'  => 'MHonArc/CharEnt/ISO8859_1R.pm',
    'latin1'      => 'MHonArc/CharEnt/ISO8859_1R.pm',
    'iso-8859-3'  => 'MHonArc/CharEnt/ISO8859_3R.pm',
    'latin3'      => 'MHonArc/CharEnt/ISO8859_3R.pm',
    'iso-8859-9'  => 'MHonArc/CharEnt/ISO8859_9R.pm',
    'latin5'      => 'MHonArc/CharEnt/ISO8859_9R.pm',
    'iso-8859-15' => 'MHonArc/CharEnt/ISO8859_15R.pm',
    'latin9'      => 'MHonArc/CharEnt/ISO8859_15R.pm',

    # ISO-8859 parts without an alias
    'iso-8859-7'  => 'MHonArc/CharEnt/ISO8859_7R.pm',
    'iso-8859-8'  => 'MHonArc/CharEnt/ISO8859_8R.pm',
);
102 | ||
103 | ############################################################################### | |
104 | ## Routines | |
105 | ############################################################################### | |
106 | ||
##---------------------------------------------------------------------------##
##	str2sgml converts a string encoded by $charset to an sgml
##	string where special characters are converted to entity
##	references.
##
##	    $return_data = MHonArc::CharEnt::str2sgml($data, $charset, $only8bit);
##
##	$charset is matched case-insensitively, with "_" treated as "-".
##
##	If $only8bit is true, then only characters with a code of 0xA0
##	or above are translated; everything below 0xA0 (including the
##	markup characters ", &, < and >) is copied through unchanged.
##
##	A character is replaced by "&name;" when the charset map, or
##	failing that the built-in common map, has an entity name for
##	its code.  All other characters are copied through unchanged.
##
sub str2sgml {
    my $data     = shift;
    my $charset  = lc shift;
    my $only8bit = shift;

    $charset =~ tr/_/-/;

    # Get mapping: use the cached table, loading it on first use.
    my $map = $char2ent_maps{$charset};
    $map = _load_charmap($charset) unless defined $map;

    # Convert string one character at a time.
    my $ret = '';
    for my $offset (0 .. length($data) - 1) {
        my $char = substr($data, $offset, 1);
        my $code = ord $char;

        my $entity;
        unless ($only8bit && $code < 0xA0) {
            $entity = $map->{$code} || $ASCIIMap{$code};
        }

        # Append the original character rather than re-packing its code:
        # pack("C") would wrap a code above 0xFF and corrupt the data.
        $ret .= $entity ? '&' . $entity . ';' : $char;
    }
    return $ret;
}
146 | ||
##---------------------------------------------------------------------------##
##	sgml2str converts a string with sdata character entity references
##	to the raw character values denoted by a character set.
##
##	    $return_data = MHonArc::CharEnt::sgml2str($data, $charset);
##
##	$charset is matched case-insensitively, with "_" treated as "-".
##	An entity name is looked up in the charset's reverse map first and
##	in the built-in common map second; a reference whose name is in
##	neither map is left in the output as it was.
##
sub sgml2str {
    my ($data, $charset) = @_;
    $charset = lc $charset;
    $charset =~ tr/_/-/;

    # Get mapping: use the cached reverse table, loading it on first use.
    my $map = $ent2char_maps{$charset};
    $map = _reverse_load_charmap($charset) unless defined $map;

    # Convert character entities to raw values.
    $data =~ s{\&([\w\.\-]+);}{
        my $name = $1;
        my $code = defined($map->{$name}) ? $map->{$name}
                                          : $ASCIIMapReverse{$name};
        defined($code) ? sprintf("%c", $code) : "&$name;";
    }gex;

    $data;
}
171 | ||
172 | ##---------------------------------------------------------------------------## | |
173 | ||
##	_load_charmap loads the character-code => entity-name table for
##	$charset, stores it in %char2ent_maps and returns it.
##
##	    $map = _load_charmap($charset);
##
##	Always returns a hash reference.  An empty table is stored and
##	returned, after a warning, when the charset is unknown, when the
##	map file cannot be loaded, or when the file's value is not a
##	hash reference.
##
sub _load_charmap {
    my $charset = shift;
    my $file    = $CharsetMaps{$charset};
    my $map;

    if (!defined $file) {
        warn 'Warning: MHonArc::CharEnt: Unknown charset: ', $charset, "\n";
    }
    else {
        # require only returns the file's own value when it actually
        # reads the file, so forget any earlier load of it first.
        delete $INC{$file};

        my $loaded;
        if (!eval { $loaded = require $file; 1 }) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
        }
        elsif (!UNIVERSAL::isa($loaded, 'HASH')) {
            # Callers dereference the result as a hash; a file that ends
            # in e.g. "1;" would otherwise make them die under strict refs.
            warn 'Warning: MHonArc::CharEnt: ', $file,
                 " did not return a hash reference\n";
        }
        else {
            $map = $loaded;
        }
    }

    $map = {} unless defined $map;
    $char2ent_maps{$charset} = $map;
    return $map;
}
195 | ||
##	_reverse_load_charmap loads the entity-name => character-code table
##	for $charset, stores it in %ent2char_maps and returns it.
##
##	    $map = _reverse_load_charmap($charset);
##
##	If the charset has a file listed in %ReverseCharsetMaps, the table
##	comes from that file.  Otherwise it is built by inverting the
##	forward table, which is loaded first if it is not cached yet.
##
##	An empty table is stored and returned, after a warning, when the
##	reverse file cannot be loaded or its value is not a hash reference.
##
sub _reverse_load_charmap {
    my $charset = shift;
    my $file    = $ReverseCharsetMaps{$charset};
    my $map;

    if (!defined $file) {
        # No ready-made reverse table: invert the forward one.
        my $forward = $char2ent_maps{$charset};
        $forward = _load_charmap($charset) unless defined $forward;
        $map = { reverse %$forward };
    }
    else {
        # require only returns the file's own value when it actually
        # reads the file, so forget any earlier load of it first.
        delete $INC{$file};

        my $loaded;
        if (!eval { $loaded = require $file; 1 }) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
        }
        elsif (!UNIVERSAL::isa($loaded, 'HASH')) {
            # Callers dereference the result as a hash; a file that ends
            # in e.g. "1;" would otherwise make them die under strict refs.
            warn 'Warning: MHonArc::CharEnt: ', $file,
                 " did not return a hash reference\n";
        }
        else {
            $map = $loaded;
        }
    }

    $map = {} unless defined $map;
    $ent2char_maps{$charset} = $map;
    return $map;
}
219 | ||
220 | ##---------------------------------------------------------------------------## | |
221 | 1; | |
222 |