[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / MHonArc / UTF8.pm

##---------------------------------------------------------------------------##
##  File:
##	$Id: UTF8.pm,v 1.3 2002/07/30 05:10:30 ehood Exp $
##  Author:
##      Earl Hood       earl@earlhood.com
##  Description:
##	CHARSETCONVERTER module that support conversion to UTF-8 via
##	Unicode::MapUTF8 module.  It also requires versions of perl
##	that support 'use utf8' pragma.
##---------------------------------------------------------------------------##
##    Copyright (C) 2002	Earl Hood, earl@earlhood.com
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program; if not, write to the Free Software
##    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
##    02111-1307, USA
##---------------------------------------------------------------------------##

package MHonArc::UTF8;

use strict;
use Unicode::String;
use Unicode::MapUTF8 qw(
    to_utf8 utf8_charset_alias utf8_supported_charset
);

BEGIN {
    utf8_charset_alias({ 'windows-1250' => 'cp1250' });
    utf8_charset_alias({ 'windows-1252' => 'cp1252' });
}

my %HTMLSpecials = (
    '"'	=> '&quot;',
    '&'	=> '&amp;',
    '<'	=> '&lt;',
    '>'	=> '&gt;',
);

sub entify {
    use utf8;
    my $str = shift;
    $str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
    $str;
}

sub str2sgml{
    my $charset = lc($_[1]);
    my $str;

    if ($charset eq 'utf-8' || $charset eq 'utf8') {
	use utf8;
	($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
	return $str;
    }

    if (utf8_supported_charset($charset)) {
	$str = to_utf8({-string => $_[0], -charset => $charset});
	{
	    use utf8;
	    $str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
	}

    } else {
	warn qq/Warning: Unable to convert "$charset" to UTF-8\n/;
	($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
    }
    $str;
}

sub clip {
    use utf8;
    my $str      = \shift;  # Prevent unnecessary copy.
    my $len      = shift;   # Clip length
    my $is_html  = shift;   # If entity references should be considered
    my $has_tags = shift;   # If html tags should be stripped

    my $u = Unicode::String::utf8($$str);

    if (!$is_html) {
      return $u->substr(0, $len);
    }

    my $text = Unicode::String::utf8("");
    my $subtext;
    my $html_len = $u->length;
    my($pos, $sublen, $erlen, $real_len);
    my $er_len = 0;
    
    for ( $pos=0, $sublen=$len; $pos < $html_len; ) {
	$subtext = $u->substr($pos, $sublen);
	$pos += $sublen;

	# strip tags
	if ($has_tags) {
	    # Strip full tags
	    $subtext =~ s/<[^>]*>//g;
	    # Check if clipped part of a tag
	    if ($subtext =~ s/<[^>]*\Z//) {
		my $gt = $u->index('>', $pos);
		$pos = ($gt < 0) ? $html_len : ($gt+1);
	    }
	}

	# check for clipped entity reference
	if (($pos < $html_len) && ($subtext =~ /\&[^;]*\Z/)) {
	    my $semi = $u->index(';', $pos);
	    if ($semi < 0) {
		# malformed entity reference
		$subtext .= $u->substr($pos);
		$pos = $html_len;
	    } else {
		$subtext .= $u->substr($pos, $semi-$pos+1);
		$pos = $semi+1;
	    }
	}

	# compute entity reference lengths to determine "real" character
	# count and not raw character count.
	while ($subtext =~ /(\&[^;]+);/g) {
	    $er_len += length($1);
	}

	$text .= $subtext;

	# done if we have enough
	$real_len = $text->length - $er_len;
	if ($real_len >= $len) {
	    last;
	}
	$sublen = $len - ($text->length - $er_len);
    }
    $text;
}

##---------------------------------------------------------------------------##
1;
__END__
Commit	Line	Data
86530b38 AT	1	##---------------------------------------------------------------------------##
	2	## File:
	3	## $Id: UTF8.pm,v 1.3 2002/07/30 05:10:30 ehood Exp $
	4	## Author:
	5	## Earl Hood earl@earlhood.com
	6	## Description:
	7	## CHARSETCONVERTER module that support conversion to UTF-8 via
	8	## Unicode::MapUTF8 module. It also requires versions of perl
	9	## that support 'use utf8' pragma.
	10	##---------------------------------------------------------------------------##
	11	## Copyright (C) 2002 Earl Hood, earl@earlhood.com
	12	##
	13	## This program is free software; you can redistribute it and/or modify
	14	## it under the terms of the GNU General Public License as published by
	15	## the Free Software Foundation; either version 2 of the License, or
	16	## (at your option) any later version.
	17	##
	18	## This program is distributed in the hope that it will be useful,
	19	## but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	## GNU General Public License for more details.
	22	##
	23	## You should have received a copy of the GNU General Public License
	24	## along with this program; if not, write to the Free Software
	25	## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
	26	## 02111-1307, USA
	27	##---------------------------------------------------------------------------##
	28
	29	package MHonArc::UTF8;
	30
	31	use strict;
	32	use Unicode::String;
	33	use Unicode::MapUTF8 qw(
	34	to_utf8 utf8_charset_alias utf8_supported_charset
	35	);
	36
	37	BEGIN {
	38	utf8_charset_alias({ 'windows-1250' => 'cp1250' });
	39	utf8_charset_alias({ 'windows-1252' => 'cp1252' });
	40	}
	41
	42	my %HTMLSpecials = (
	43	'"' => '"',
	44	'&' => '&',
	45	'<' => '<',
	46	'>' => '>',
	47	);
	48
	49	sub entify {
	50	use utf8;
	51	my $str = shift;
	52	$str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
	53	$str;
	54	}
	55
	56	sub str2sgml{
	57	my $charset = lc($_[1]);
	58	my $str;
	59
	60	if ($charset eq 'utf-8' \|\| $charset eq 'utf8') {
	61	use utf8;
	62	($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
	63	return $str;
	64	}
65
66	if (utf8_supported_charset($charset)) {
67	$str = to_utf8({-string => $_[0], -charset => $charset});
68	{
69	use utf8;
70	$str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
71	}
72
73	} else {
74	warn qq/Warning: Unable to convert "$charset" to UTF-8\n/;
75	($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
76	}
77	$str;
78	}
79
80	sub clip {
81	use utf8;
82	my $str = \shift; # Prevent unnecessary copy.
83	my $len = shift; # Clip length
84	my $is_html = shift; # If entity references should be considered
85	my $has_tags = shift; # If html tags should be stripped
86
87	my $u = Unicode::String::utf8($$str);
88
89	if (!$is_html) {
90	return $u->substr(0, $len);
91	}
92
93	my $text = Unicode::String::utf8("");
94	my $subtext;
95	my $html_len = $u->length;
96	my($pos, $sublen, $erlen, $real_len);
97	my $er_len = 0;
98
99	for ( $pos=0, $sublen=$len; $pos < $html_len; ) {
100	$subtext = $u->substr($pos, $sublen);
101	$pos += $sublen;
102
103	# strip tags
104	if ($has_tags) {
105	# Strip full tags
106	$subtext =~ s/<[^>]*>//g;
107	# Check if clipped part of a tag
108	if ($subtext =~ s/<[^>]*\Z//) {
109	my $gt = $u->index('>', $pos);
110	$pos = ($gt < 0) ? $html_len : ($gt+1);
111	}
112	}
113
114	# check for clipped entity reference
115	if (($pos < $html_len) && ($subtext =~ /\&[^;]*\Z/)) {
116	my $semi = $u->index(';', $pos);
117	if ($semi < 0) {
118	# malformed entity reference
119	$subtext .= $u->substr($pos);
120	$pos = $html_len;
121	} else {
122	$subtext .= $u->substr($pos, $semi-$pos+1);
123	$pos = $semi+1;
124	}
125	}
126
127	# compute entity reference lengths to determine "real" character
128	# count and not raw character count.
129	while ($subtext =~ /(\&[^;]+);/g) {
130	$er_len += length($1);
131	}
132
133	$text .= $subtext;
134
135	# done if we have enough
136	$real_len = $text->length - $er_len;
137	if ($real_len >= $len) {
138	last;
139	}
140	$sublen = $len - ($text->length - $er_len);
141	}
142	$text;
143	}
144
145	##---------------------------------------------------------------------------##
146	1;
147	__END__