##---------------------------------------------------------------------------##
## $Id: iso2022jp.pl,v 1.8 2002/07/30 18:20:46 ehood Exp $
## Earl Hood mhonarc@mhonarc.org
## NIIBE Yutaka gniibe@mri.co.jp
## Takashi P.KATOH p-katoh@shiratori.riec.tohoku.ac.jp
## Library defines routine to process iso-2022-jp data.
##---------------------------------------------------------------------------##
## Copyright (C) 1995-2002
## Earl Hood, mhonarc@mhonarc.org
## NIIBE Yutaka, gniibe@mri.co.jp
## Takashi P.KATOH, p-katoh@shiratori.riec.tohoku.ac.jp
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
##---------------------------------------------------------------------------##
## URL recognition patterns.
##
## $Url is an alternation, wrapped in a capturing group, of the scheme
## prefixes that may start a URL.
$Url = '(http://|https://|ftp://|afs://|wais://|telnet://|ldap://' .
       '|gopher://|news:|nntp:|mid:|cid:|mailto:|prospero:)';

## $UrlExp: a scheme followed by any run of characters other than
## whitespace, parentheses, '|', angle brackets and quotes, and ending in
## one character that is not trailing punctuation, a bracket, a
## parenthesis, an angle bracket or whitespace.
## Each pattern is kept on ONE physical line on purpose: q%...% takes its
## content literally, so a line break inside the delimiters would be
## stored in the pattern text.
$UrlExp = $Url . q%[^\s\(\)\|<>"']*[^\.?!;,"'\|\[\]\(\)\s<>]%;

## $HUrlExp: same as $UrlExp, except that '&' is additionally excluded
## from both character classes, so a match stops in front of an '&'.
$HUrlExp = $Url . q%[^\s\(\)\|<>"'\&]*[^\.?!;,"'\|\[\]\(\)\s<>\&]%;
##---------------------------------------------------------------------------##
## str2html(): Convert an iso-2022-jp string into HTML.  The calling
## interface is similar to that of the function of the same purpose in
## iso8859.pl.
##
## The work is delegated to jp2022_to_html(), which receives the
## caller's first argument plus a constant second argument of 1
## (presumably a flag; its meaning is defined by jp2022_to_html()).
## Whatever jp2022_to_html() returns is returned unchanged.
sub str2html {
    return jp2022_to_html($_[0], 1);
}
##---------------------------------------------------------------------------##
## Function to convert ISO-2022-JP data into HTML.  The function is based
## on the following RFCs:
## RFC 1468: J. Murai, M. Crispin, E. van der Poel, "Japanese Character
## Encoding for Internet Messages", 06/04/1993. (Pages=6)
## RFC 1554: M. Ohta, K. Handa, "ISO-2022-JP-2: Multilingual Extension of
## ISO-2022-JP", 12/23/1993. (Pages=6)
##
## NOTE(review): the "sub" line, the loop constructs and the closing
## braces of this routine are not among the lines below, so the comments
## describe only the statements that are visible here.
##
## Visible flow: the text in $_ is consumed from the front by
## substitutions that strip whatever they match: first an escape
## sequence that selects a segment type, then the content that follows.
##
# Split the message body into lines at LF or CRLF.
my(@lines) = split(/\r?\n/,$body);
# a trick to process preceding ASCII text: text that does not begin with
# ESC gets ESC ( B prepended, so that the single-byte branch below
# matches it.
$_ = "\033(B" . $_ unless /^\033/;
if (s/^(\033\([BJ])//) { # Single Byte Segment
if (s/^([^\033]+)//) { # ASCII plain text
# Replace meta characters in ASCII plain text
# NOTE(review): the replacement halves of the next three substitutions
# contain no entity names (no "amp", "lt" or "gt"), and the line breaks
# inside the delimiters are part of the pattern or replacement text.
# Presumably the HTML entities were lost -- confirm against the upstream
# file.  $ascii_text is also not assigned in any visible line.
$ascii_text =~ s
%\
&%\
&
;%g;
$ascii_text =~ s
%<%\
<%g;
$ascii_text =~ s
%>%\
>%g;
## Convert URLs to hyperlinks: every match of $HUrlExp (case-insensitive,
## pattern compiled once because of /o) is wrapped in an anchor element
## whose href is the matched text.
$ascii_text =~ s
%($HUrlExp)%<a href
="$1">$1</a
>%gio
} elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence
} elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence
# NOTE(review): in the class above, \7f reads as octal escape \7 followed
# by a literal 'f', which makes space the start of a descending range.
# Presumably \x7f was intended -- confirm.
} elsif (s/^(\033\$[\@AB]|\033\$\([CD])//) { # Double Byte Segment
if (s/^([!-~][!-~]+)//) { # Double Char plain text
# Consumes a run of two or more characters in the range '!' .. '~'.
} elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence
} elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence
# NOTE(review): same \7f question as in the single-byte branch.
# Something wrong in text
##---------------------------------------------------------------------------##
## clip($str, $length, $is_html, $has_tags): Clip an iso-2022-jp string.
## The third argument, $is_html, specifies whether '&' should be treated
## as an HTML character or not
## (i.e., the length of '&' will be 1 if $is_html).
## The fourth argument, $has_tags, enables separate handling of a '<'
## character in single-byte text (see the test on $1 below).
##
## NOTE(review): argument unpacking, the loops (including the one
## labelled CLIP), the branch bodies and the closing braces are not among
## the lines below, so the comments describe only the visible statements.
sub clip
{ # &clip($str, 10, 1, 1);
# a trick to process preceding ASCII text: text that does not begin with
# ESC gets ESC ( B prepended, so that the single-byte branch below
# matches it.
$_ = "\033(B" . $_ unless /^\033/;
if (s/^(\033\([BJ])//) { # Single Byte Segment
# Single-byte text is consumed one non-ESC character at a time.
if (s/^([^\033])//) { # ASCII plain text
if (($1 eq '<') && $has_tags) {
} elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence
} elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence
# NOTE(review): in the class above, \7f reads as octal escape \7 followed
# by a literal 'f', which makes space the start of a descending range.
# Presumably \x7f was intended -- confirm.
# Leave the CLIP loop as soon as $length is no longer positive.
last CLIP
if ($length <= 0);
} elsif (s/^(\033\$[\@AB]|\033\$\([CD])//) { # Double Byte Segment
# Double-byte text is consumed two characters ('!' .. '~') at a time.
if (s/^([!-~][!-~])//) { # Double Char plain text
# The length of a double-byte-char is assumed 2.
# If we consider compatibility with UTF-8, it should be 1.
} elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence
} elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence
# NOTE(review): same \7f question as in the single-byte branch.
# Leave the CLIP loop as soon as $length is no longer positive.
last CLIP
if ($length <= 0);
# Something wrong in text
# Should we check the last \033\([BJ] sequence?
# (I believe it is too paranoid).
# Append ESC ( B to the result unless $inascii is true.
$ret .= "\033(B" unless $inascii;
##---------------------------------------------------------------------------##