Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | ##---------------------------------------------------------------------------## |
2 | ## File: | |
3 | ## $Id: UTF8.pm,v 1.3 2002/07/30 05:10:30 ehood Exp $ | |
4 | ## Author: | |
5 | ## Earl Hood earl@earlhood.com | |
6 | ## Description: | |
7 | ## CHARSETCONVERTER module that support conversion to UTF-8 via | |
8 | ## Unicode::MapUTF8 module. It also requires versions of perl | |
9 | ## that support 'use utf8' pragma. | |
10 | ##---------------------------------------------------------------------------## | |
11 | ## Copyright (C) 2002 Earl Hood, earl@earlhood.com | |
12 | ## | |
13 | ## This program is free software; you can redistribute it and/or modify | |
14 | ## it under the terms of the GNU General Public License as published by | |
15 | ## the Free Software Foundation; either version 2 of the License, or | |
16 | ## (at your option) any later version. | |
17 | ## | |
18 | ## This program is distributed in the hope that it will be useful, | |
19 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | ## GNU General Public License for more details. | |
22 | ## | |
23 | ## You should have received a copy of the GNU General Public License | |
24 | ## along with this program; if not, write to the Free Software | |
25 | ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA | |
26 | ## 02111-1307, USA | |
27 | ##---------------------------------------------------------------------------## | |
28 | ||
29 | package MHonArc::UTF8; | |
30 | ||
31 | use strict; | |
32 | use Unicode::String; | |
33 | use Unicode::MapUTF8 qw( | |
34 | to_utf8 utf8_charset_alias utf8_supported_charset | |
35 | ); | |
36 | ||
37 | BEGIN { | |
38 | utf8_charset_alias({ 'windows-1250' => 'cp1250' }); | |
39 | utf8_charset_alias({ 'windows-1252' => 'cp1252' }); | |
40 | } | |
41 | ||
42 | my %HTMLSpecials = ( | |
43 | '"' => '"', | |
44 | '&' => '&', | |
45 | '<' => '<', | |
46 | '>' => '>', | |
47 | ); | |
48 | ||
49 | sub entify { | |
50 | use utf8; | |
51 | my $str = shift; | |
52 | $str =~ s/(["&<>])/$HTMLSpecials{$1}/g; | |
53 | $str; | |
54 | } | |
55 | ||
56 | sub str2sgml{ | |
57 | my $charset = lc($_[1]); | |
58 | my $str; | |
59 | ||
60 | if ($charset eq 'utf-8' || $charset eq 'utf8') { | |
61 | use utf8; | |
62 | ($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g; | |
63 | return $str; | |
64 | } | |
65 | ||
66 | if (utf8_supported_charset($charset)) { | |
67 | $str = to_utf8({-string => $_[0], -charset => $charset}); | |
68 | { | |
69 | use utf8; | |
70 | $str =~ s/(["&<>])/$HTMLSpecials{$1}/g; | |
71 | } | |
72 | ||
73 | } else { | |
74 | warn qq/Warning: Unable to convert "$charset" to UTF-8\n/; | |
75 | ($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g; | |
76 | } | |
77 | $str; | |
78 | } | |
79 | ||
80 | sub clip { | |
81 | use utf8; | |
82 | my $str = \shift; # Prevent unnecessary copy. | |
83 | my $len = shift; # Clip length | |
84 | my $is_html = shift; # If entity references should be considered | |
85 | my $has_tags = shift; # If html tags should be stripped | |
86 | ||
87 | my $u = Unicode::String::utf8($$str); | |
88 | ||
89 | if (!$is_html) { | |
90 | return $u->substr(0, $len); | |
91 | } | |
92 | ||
93 | my $text = Unicode::String::utf8(""); | |
94 | my $subtext; | |
95 | my $html_len = $u->length; | |
96 | my($pos, $sublen, $erlen, $real_len); | |
97 | my $er_len = 0; | |
98 | ||
99 | for ( $pos=0, $sublen=$len; $pos < $html_len; ) { | |
100 | $subtext = $u->substr($pos, $sublen); | |
101 | $pos += $sublen; | |
102 | ||
103 | # strip tags | |
104 | if ($has_tags) { | |
105 | # Strip full tags | |
106 | $subtext =~ s/<[^>]*>//g; | |
107 | # Check if clipped part of a tag | |
108 | if ($subtext =~ s/<[^>]*\Z//) { | |
109 | my $gt = $u->index('>', $pos); | |
110 | $pos = ($gt < 0) ? $html_len : ($gt+1); | |
111 | } | |
112 | } | |
113 | ||
114 | # check for clipped entity reference | |
115 | if (($pos < $html_len) && ($subtext =~ /\&[^;]*\Z/)) { | |
116 | my $semi = $u->index(';', $pos); | |
117 | if ($semi < 0) { | |
118 | # malformed entity reference | |
119 | $subtext .= $u->substr($pos); | |
120 | $pos = $html_len; | |
121 | } else { | |
122 | $subtext .= $u->substr($pos, $semi-$pos+1); | |
123 | $pos = $semi+1; | |
124 | } | |
125 | } | |
126 | ||
127 | # compute entity reference lengths to determine "real" character | |
128 | # count and not raw character count. | |
129 | while ($subtext =~ /(\&[^;]+);/g) { | |
130 | $er_len += length($1); | |
131 | } | |
132 | ||
133 | $text .= $subtext; | |
134 | ||
135 | # done if we have enough | |
136 | $real_len = $text->length - $er_len; | |
137 | if ($real_len >= $len) { | |
138 | last; | |
139 | } | |
140 | $sublen = $len - ($text->length - $er_len); | |
141 | } | |
142 | $text; | |
143 | } | |
144 | ||
145 | ##---------------------------------------------------------------------------## | |
146 | 1; | |
147 | __END__ |