Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / MHonArc / UTF8.pm
CommitLineData
86530b38
AT
1##---------------------------------------------------------------------------##
2## File:
3## $Id: UTF8.pm,v 1.3 2002/07/30 05:10:30 ehood Exp $
4## Author:
5## Earl Hood earl@earlhood.com
6## Description:
7## CHARSETCONVERTER module that support conversion to UTF-8 via
8## Unicode::MapUTF8 module. It also requires versions of perl
9## that support 'use utf8' pragma.
10##---------------------------------------------------------------------------##
11## Copyright (C) 2002 Earl Hood, earl@earlhood.com
12##
13## This program is free software; you can redistribute it and/or modify
14## it under the terms of the GNU General Public License as published by
15## the Free Software Foundation; either version 2 of the License, or
16## (at your option) any later version.
17##
18## This program is distributed in the hope that it will be useful,
19## but WITHOUT ANY WARRANTY; without even the implied warranty of
20## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21## GNU General Public License for more details.
22##
23## You should have received a copy of the GNU General Public License
24## along with this program; if not, write to the Free Software
25## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
26## 02111-1307, USA
27##---------------------------------------------------------------------------##
28
29package MHonArc::UTF8;
30
31use strict;
32use Unicode::String;
33use Unicode::MapUTF8 qw(
34 to_utf8 utf8_charset_alias utf8_supported_charset
35);
36
37BEGIN {
38 utf8_charset_alias({ 'windows-1250' => 'cp1250' });
39 utf8_charset_alias({ 'windows-1252' => 'cp1252' });
40}
41
42my %HTMLSpecials = (
43 '"' => '"',
44 '&' => '&',
45 '<' => '&lt;',
46 '>' => '&gt;',
47);
48
49sub entify {
50 use utf8;
51 my $str = shift;
52 $str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
53 $str;
54}
55
56sub str2sgml{
57 my $charset = lc($_[1]);
58 my $str;
59
60 if ($charset eq 'utf-8' || $charset eq 'utf8') {
61 use utf8;
62 ($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
63 return $str;
64 }
65
66 if (utf8_supported_charset($charset)) {
67 $str = to_utf8({-string => $_[0], -charset => $charset});
68 {
69 use utf8;
70 $str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
71 }
72
73 } else {
74 warn qq/Warning: Unable to convert "$charset" to UTF-8\n/;
75 ($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
76 }
77 $str;
78}
79
80sub clip {
81 use utf8;
82 my $str = \shift; # Prevent unnecessary copy.
83 my $len = shift; # Clip length
84 my $is_html = shift; # If entity references should be considered
85 my $has_tags = shift; # If html tags should be stripped
86
87 my $u = Unicode::String::utf8($$str);
88
89 if (!$is_html) {
90 return $u->substr(0, $len);
91 }
92
93 my $text = Unicode::String::utf8("");
94 my $subtext;
95 my $html_len = $u->length;
96 my($pos, $sublen, $erlen, $real_len);
97 my $er_len = 0;
98
99 for ( $pos=0, $sublen=$len; $pos < $html_len; ) {
100 $subtext = $u->substr($pos, $sublen);
101 $pos += $sublen;
102
103 # strip tags
104 if ($has_tags) {
105 # Strip full tags
106 $subtext =~ s/<[^>]*>//g;
107 # Check if clipped part of a tag
108 if ($subtext =~ s/<[^>]*\Z//) {
109 my $gt = $u->index('>', $pos);
110 $pos = ($gt < 0) ? $html_len : ($gt+1);
111 }
112 }
113
114 # check for clipped entity reference
115 if (($pos < $html_len) && ($subtext =~ /\&[^;]*\Z/)) {
116 my $semi = $u->index(';', $pos);
117 if ($semi < 0) {
118 # malformed entity reference
119 $subtext .= $u->substr($pos);
120 $pos = $html_len;
121 } else {
122 $subtext .= $u->substr($pos, $semi-$pos+1);
123 $pos = $semi+1;
124 }
125 }
126
127 # compute entity reference lengths to determine "real" character
128 # count and not raw character count.
129 while ($subtext =~ /(\&[^;]+);/g) {
130 $er_len += length($1);
131 }
132
133 $text .= $subtext;
134
135 # done if we have enough
136 $real_len = $text->length - $er_len;
137 if ($real_len >= $len) {
138 last;
139 }
140 $sublen = $len - ($text->length - $er_len);
141 }
142 $text;
143}
144
145##---------------------------------------------------------------------------##
1461;
147__END__