| 1 | ##---------------------------------------------------------------------------## |
| 2 | ## File: |
| 3 | ## $Id: CharEnt.pm,v 1.3 2002/04/13 00:58:09 ehood Exp $ |
| 4 | ## Author: |
| 5 | ## Earl Hood earl@earlhood.com |
| 6 | ## Description: |
| 7 | ## Module to deal with 8-bit character data conversion to |
| 8 | ## (SGML) entity references. |
| 9 | ##---------------------------------------------------------------------------## |
| 10 | ## Copyright (C) 1997-2002 Earl Hood, earl@earlhood.com |
| 11 | ## |
| 12 | ## This program is free software; you can redistribute it and/or modify |
| 13 | ## it under the terms of the GNU General Public License as published by |
| 14 | ## the Free Software Foundation; either version 2 of the License, or |
| 15 | ## (at your option) any later version. |
| 16 | ## |
| 17 | ## This program is distributed in the hope that it will be useful, |
| 18 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | ## GNU General Public License for more details. |
| 21 | ## |
| 22 | ## You should have received a copy of the GNU General Public License |
| 23 | ## along with this program; if not, write to the Free Software |
| 24 | ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
| 25 | ## 02111-1307, USA |
| 26 | ##---------------------------------------------------------------------------## |
| 27 | |
| 28 | package MHonArc::CharEnt; |
| 29 | |
| 30 | use strict; |
| 31 | |
| 32 | ##--------------------------------------------------------------------------- |
| 33 | ## US-ASCII/Common characters |
| 34 | ##--------------------------------------------------------------------------- |
| 35 | |
# Characters that get an entity reference regardless of the charset in
# use.  Keys are character codes, values are entity names without the
# surrounding '&' and ';'.
my %ASCIIMap = (
    #----------------------------------------------------------------------
    # Code  Entity Ref      # Hex  : ISO external entity and description
    #----------------------------------------------------------------------
    34  => 'quot',          # 0x22 : ISOnum : Quotation mark
    38  => 'amp',           # 0x26 : ISOnum : Ampersand
    60  => 'lt',            # 0x3C : ISOnum : Less-than sign
    62  => 'gt',            # 0x3E : ISOnum : Greater-than sign

    160 => 'nbsp',          # 0xA0 : ISOnum : NO-BREAK SPACE
);

# Inverse of %ASCIIMap: entity name => character code.  keys() and
# values() walk an unmodified hash in the same order, so the slice
# pairs every entity name with its own code.
my %ASCIIMapReverse;
@ASCIIMapReverse{ values %ASCIIMap } = keys %ASCIIMap;
| 49 | |
| 50 | ##--------------------------------------------------------------------------- |
| 51 | ## Loaded Maps |
| 52 | ##--------------------------------------------------------------------------- |
| 53 | |
# Forward maps loaded so far, keyed by charset name.  Each value is a
# hash reference of character code => entity name.
my %char2ent_maps;
$char2ent_maps{'us-ascii'} = \%ASCIIMap;

# Reverse maps loaded so far, keyed by charset name.  Each value is a
# hash reference of entity name => character code.
my %ent2char_maps;
$ent2char_maps{'us-ascii'} = \%ASCIIMapReverse;
| 62 | |
| 63 | ##--------------------------------------------------------------------------- |
| 64 | ## Charset specification to mapping |
| 65 | ##--------------------------------------------------------------------------- |
| 66 | |
# Charset name => file (as handed to require) providing the
# character => entity mapping for that charset.
my %CharsetMaps = do {
    my $dir = 'MHonArc/CharEnt';

    # "latinN" alias number => ISO-8859 part number it stands for.
    my %part_of_latin = (
        1 => 1, 2 => 2, 3 => 3, 4 => 4, 5 => 9, 6 => 10, 9 => 15,
    );

    (
        (map { ("iso-8859-$_" => "$dir/ISO8859_$_.pm") } 1 .. 10, 15),
        (map { ("latin$_" => "$dir/ISO8859_$part_of_latin{$_}.pm") }
            keys %part_of_latin),
        (map { ("windows-$_" => "$dir/CP$_.pm") } 1250, 1252),
    );
};
| 89 | |
# Charset name => file (as handed to require) providing the
# entity => character mapping for that charset.  Charsets without an
# entry here get their reverse map derived from the forward map by
# _reverse_load_charmap.
my %ReverseCharsetMaps = do {
    my $dir = 'MHonArc/CharEnt';

    my %file_for = map { ("iso-8859-$_" => "$dir/ISO8859_${_}R.pm") }
                       1, 3, 7, 8, 9, 15;

    # "latinN" aliases share the file of the ISO-8859 part they name.
    my %part_of_latin = (1 => 1, 3 => 3, 5 => 9, 9 => 15);
    for my $n (keys %part_of_latin) {
        $file_for{"latin$n"} = $file_for{"iso-8859-$part_of_latin{$n}"};
    }

    %file_for;
};
| 102 | |
| 103 | ############################################################################### |
| 104 | ## Routines |
| 105 | ############################################################################### |
| 106 | |
##---------------------------------------------------------------------------##
##  str2sgml converts a string encoded by $charset to an sgml
##  string where special characters are converted to entity
##  references.
##
##      $return_data = MHonArc::CharEnt::str2sgml($data, $charset, $only8bit);
##
##  $charset is matched case-insensitively and '_' is treated as '-'.
##  The charset's map is consulted first; characters it does not cover
##  fall back to the common US-ASCII map (quot, amp, lt, gt, nbsp).
##
##  If $only8bit is true, then only 8-bit characters (codes 0x80 and
##  above) are translated; 7-bit characters are copied through
##  unchanged, including '"', '&', '<' and '>'.
##
##  Characters without an entity are copied through unchanged.  An
##  undefined $data yields the empty string.
##
sub str2sgml {
    my $data     = shift;
    my $charset  = lc shift;
    my $only8bit = shift;

    $charset =~ tr/_/-/;

    # Get mapping, loading it on first use of the charset.
    my $map = $char2ent_maps{$charset};
    $map = _load_charmap($charset) unless defined $map;

    # Convert string one character at a time.
    my $ret = '';
    my $len = defined($data) ? length($data) : 0;
    for (my $offset = 0; $offset < $len; ++$offset) {
        my $char = substr($data, $offset, 1);

        # ord() rather than unpack("C"): a code above 0xFF must not be
        # wrapped to a byte, neither for the lookup nor for the output.
        my $code = ord($char);

        my $ent;
        # The cut-off is 0x80, not 0xA0, so that charsets which assign
        # printable characters to 0x80-0x9F are still translated.
        unless ($only8bit && $code < 0x80) {
            $ent = $map->{$code} || $ASCIIMap{$code};
        }

        # Append the original character itself when there is no entity.
        $ret .= $ent ? "&$ent;" : $char;
    }
    return $ret;
}
| 146 | |
##---------------------------------------------------------------------------##
##  sgml2str is the inverse of str2sgml: character entity references
##  in a string are replaced by the raw character values they denote
##  in the given character set.
##
##      $return_data = MHonArc::CharEnt::sgml2str($data, $charset);
##
##  $charset is matched case-insensitively and '_' is treated as '-'.
##  An entity name is looked up in the charset's reverse map first and
##  then in the common US-ASCII reverse map; references known to
##  neither are left in the string as they are.
##
sub sgml2str {
    my($data, $charset) = @_;

    $charset = lc $charset;
    $charset =~ tr/_/-/;

    # Get mapping, loading it on first use of the charset.
    my $map = $ent2char_maps{$charset};
    unless (defined $map) {
        $map = _reverse_load_charmap($charset);
    }

    # Resolve one entity name: charset map first, common map second,
    # otherwise hand back the reference untouched.
    my $decode = sub {
        my $name = shift;
        for my $table ($map, \%ASCIIMapReverse) {
            return sprintf("%c", $table->{$name})
                if defined $table->{$name};
        }
        return "&$name;";
    };

    # Convert character entities to raw values.
    $data =~ s/\&([\w\.\-]+);/$decode->($1)/gex;
    return $data;
}
| 171 | |
| 172 | ##---------------------------------------------------------------------------## |
| 173 | |
##  _load_charmap loads the character => entity map for a charset and
##  caches it in %char2ent_maps.
##
##      $map = _load_charmap($charset);
##
##  $charset is used verbatim as the key into %CharsetMaps (the callers
##  in this file pass it lowercased with '_' turned into '-').  The
##  file registered for the charset must return a hash reference.
##
##  A hash reference is always returned.  When the charset is unknown,
##  the file fails to load, or the file returns something other than a
##  hash reference, a warning is issued and an empty map is cached and
##  returned.
##
sub _load_charmap {
    my $charset = shift;
    my $map;

    my $file = $CharsetMaps{$charset};
    if (!defined($file)) {
        warn 'Warning: MHonArc::CharEnt: Unknown charset: ', $charset, "\n";

    } else {
        # require() hands back the file's own return value only when it
        # really loads the file; for a file already listed in %INC it
        # just returns 1.  Forget any earlier load first.
        delete $INC{$file};

        local $@;   # leave the caller's $@ alone
        my $loaded = eval { $map = require $file; 1 };
        if (!$loaded) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
            $map = undef;

        } elsif (ref($map) ne 'HASH') {
            # Caching a non-hash value would make every later lookup
            # through the map die under "use strict".
            warn 'Warning: MHonArc::CharEnt: ', $file,
                 " did not return a hash reference\n";
            $map = undef;
        }
    }

    $map = { } unless defined $map;
    $char2ent_maps{$charset} = $map;
    return $map;
}
| 195 | |
##  _reverse_load_charmap loads the entity => character map for a
##  charset and caches it in %ent2char_maps.
##
##      $map = _reverse_load_charmap($charset);
##
##  $charset is used verbatim as the key into %ReverseCharsetMaps (the
##  caller in this file passes it lowercased with '_' turned into '-').
##  If a reverse file is registered for the charset, it is loaded and
##  must return a hash reference.  Otherwise the reverse map is built
##  by inverting the charset's forward map, which is loaded through
##  _load_charmap if it is not cached yet.
##
##  A hash reference is always returned.  When the reverse file fails
##  to load or returns something other than a hash reference, a warning
##  is issued and an empty map is cached and returned.
##
sub _reverse_load_charmap {
    my $charset = shift;
    my $map;

    my $file = $ReverseCharsetMaps{$charset};
    if (!defined($file)) {
        # No dedicated reverse file: invert the forward map.
        my $forward = $char2ent_maps{$charset};
        $forward = _load_charmap($charset) unless defined $forward;

        # Only dereference what really is a hash.
        $map = (ref($forward) eq 'HASH') ? { reverse %$forward } : { };

    } else {
        # require() hands back the file's own return value only when it
        # really loads the file; for a file already listed in %INC it
        # just returns 1.  Forget any earlier load first.
        delete $INC{$file};

        local $@;   # leave the caller's $@ alone
        my $loaded = eval { $map = require $file; 1 };
        if (!$loaded) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
            $map = { };

        } elsif (ref($map) ne 'HASH') {
            # Caching a non-hash value would make every later lookup
            # through the map die under "use strict".
            warn 'Warning: MHonArc::CharEnt: ', $file,
                 " did not return a hash reference\n";
            $map = { };
        }
    }

    $ent2char_maps{$charset} = $map;
    return $map;
}
| 219 | |
| 220 | ##---------------------------------------------------------------------------## |
| 221 | 1; |
| 222 | |