Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | ##---------------------------------------------------------------------------## |
2 | ## File: | |
3 | ## $Id: iso2022jp.pl,v 1.8 2002/07/30 18:20:46 ehood Exp $ | |
4 | ## Author(s): | |
5 | ## Earl Hood mhonarc@mhonarc.org | |
6 | ## NIIBE Yutaka gniibe@mri.co.jp | |
7 | ## Takashi P.KATOH p-katoh@shiratori.riec.tohoku.ac.jp | |
8 | ## Description: | |
9 | ## Library defines routine to process iso-2022-jp data. | |
10 | ##---------------------------------------------------------------------------## | |
11 | ## Copyright (C) 1995-2002 | |
12 | ## Earl Hood, mhonarc@mhonarc.org | |
13 | ## NIIBE Yutaka, gniibe@mri.co.jp | |
14 | ## Takashi P.KATOH, p-katoh@shiratori.riec.tohoku.ac.jp | |
15 | ## | |
16 | ## This program is free software; you can redistribute it and/or modify | |
17 | ## it under the terms of the GNU General Public License as published by | |
18 | ## the Free Software Foundation; either version 2 of the License, or | |
19 | ## (at your option) any later version. | |
20 | ## | |
21 | ## This program is distributed in the hope that it will be useful, | |
22 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
23 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
24 | ## GNU General Public License for more details. | |
25 | ## | |
26 | ## You should have received a copy of the GNU General Public License | |
27 | ## along with this program; if not, write to the Free Software | |
28 | ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA | |
29 | ## 02111-1307, USA | |
30 | ##---------------------------------------------------------------------------## | |
31 | ||
32 | package iso_2022_jp; | |
33 | ||
34 | $Url = '(http://|https://|ftp://|afs://|wais://|telnet://|ldap://' . | |
35 | '|gopher://|news:|nntp:|mid:|cid:|mailto:|prospero:)'; | |
36 | $UrlExp = $Url . q%[^\s\(\)\|<>"']*[^\.?!;,"'\|\[\]\(\)\s<>]%; | |
37 | $HUrlExp = $Url . q%[^\s\(\)\|<>"'\&]*[^\.?!;,"'\|\[\]\(\)\s<>\&]%; | |
38 | ||
39 | ##---------------------------------------------------------------------------## | |
40 | ## str2html(): Convert an iso-2022-jp string into HTML. Function | |
41 | ## interface similiar as iso8859.pl function. | |
42 | ## | |
43 | sub str2html { jp2022_to_html($_[0], 1); } | |
44 | ||
45 | ##---------------------------------------------------------------------------## | |
46 | ## Function to convert ISO-2022-JP data into HTML. Function is based | |
47 | ## on the following RFCs: | |
48 | ## | |
49 | ## RFC-1468 I | |
50 | ## J. Murai, M. Crispin, E. van der Poel, "Japanese Character | |
51 | ## Encoding for Internet Messages", 06/04/1993. (Pages=6) | |
52 | ## | |
53 | ## RFC-1554 I | |
54 | ## M. Ohta, K. Handa, "ISO-2022-JP-2: Multilingual Extension of | |
55 | ## ISO-2022-JP", 12/23/1993. (Pages=6) | |
56 | ## | |
57 | sub jp2022_to_html { | |
58 | my($body) = shift; | |
59 | my($nourl) = shift; | |
60 | my(@lines) = split(/\r?\n/,$body); | |
61 | my($ret, $ascii_text); | |
62 | local($_); | |
63 | ||
64 | $ret = ""; | |
65 | foreach (@lines) { | |
66 | # a trick to process preceding ASCII text | |
67 | $_ = "\033(B" . $_ unless /^\033/; | |
68 | ||
69 | # Process Each Segment | |
70 | while(1) { | |
71 | if (s/^(\033\([BJ])//) { # Single Byte Segment | |
72 | $ret .= $1; | |
73 | while(1) { | |
74 | if (s/^([^\033]+)//) { # ASCII plain text | |
75 | $ascii_text = $1; | |
76 | ||
77 | # Replace meta characters in ASCII plain text | |
78 | $ascii_text =~ s%\&%\&%g; | |
79 | $ascii_text =~ s%<%\<%g; | |
80 | $ascii_text =~ s%>%\>%g; | |
81 | ## Convert URLs to hyperlinks | |
82 | $ascii_text =~ s%($HUrlExp)%<a href="$1">$1</a>%gio | |
83 | unless $nourl; | |
84 | ||
85 | $ret .= $ascii_text; | |
86 | } elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence | |
87 | $ret .= $1; | |
88 | } elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence | |
89 | $ret .= $1; | |
90 | } else { | |
91 | last; | |
92 | } | |
93 | } | |
94 | } elsif (s/^(\033\$[\@AB]|\033\$\([CD])//) { # Double Byte Segment | |
95 | $ret .= $1; | |
96 | while (1) { | |
97 | if (s/^([!-~][!-~]+)//) { # Double Char plain text | |
98 | $ret .= $1; | |
99 | } elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence | |
100 | $ret .= $1; | |
101 | } elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence | |
102 | $ret .= $1; | |
103 | } else { | |
104 | last; | |
105 | } | |
106 | } | |
107 | } else { | |
108 | # Something wrong in text | |
109 | $ret .= $_; | |
110 | last; | |
111 | } | |
112 | } | |
113 | ||
114 | # remove a `trick' | |
115 | $ret =~ s/^\033\(B//; | |
116 | ||
117 | $ret .= "\n"; | |
118 | } | |
119 | ||
120 | ($ret); | |
121 | } | |
122 | ||
123 | ||
124 | ##---------------------------------------------------------------------------## | |
125 | ## clip($str, $length, $is_html, $has_tags): Clip an iso-2022-jp string. | |
126 | ## | |
127 | ## The last argument $is_html specifies '&' should be treated | |
128 | ## as HTML character or not. | |
129 | ## (i.e., the length of '&' will be 1 if $is_html). | |
130 | ## | |
131 | sub clip { # &clip($str, 10, 1, 1); | |
132 | my($str) = shift; | |
133 | my($length) = shift; | |
134 | my($is_html) = shift; | |
135 | my($has_tags) = shift; | |
136 | my($ret, $inascii); | |
137 | local($_) = $str; | |
138 | ||
139 | $ret = ""; | |
140 | # a trick to process preceding ASCII text | |
141 | $_ = "\033(B" . $_ unless /^\033/; | |
142 | ||
143 | # Process Each Segment | |
144 | CLIP: while(1) { | |
145 | if (s/^(\033\([BJ])//) { # Single Byte Segment | |
146 | $inascii = 1; | |
147 | $ret .= $1; | |
148 | while(1) { | |
149 | if (s/^([^\033])//) { # ASCII plain text | |
150 | if ($is_html) { | |
151 | if (($1 eq '<') && $has_tags) { | |
152 | s/^[^>\033]*>//; | |
153 | } else { | |
154 | if ($1 eq '&') { | |
155 | s/^([^\;]*\;)//; | |
156 | $ret .= "&$1"; | |
157 | } else { | |
158 | $ret .= $1; | |
159 | } | |
160 | $length--; | |
161 | } | |
162 | } else { | |
163 | $ret .= $1; | |
164 | $length--; | |
165 | } | |
166 | } elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence | |
167 | $ret .= $1; | |
168 | } elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence | |
169 | $ret .= $1; | |
170 | $length--; | |
171 | } else { | |
172 | last; | |
173 | } | |
174 | last CLIP if ($length <= 0); | |
175 | } | |
176 | } elsif (s/^(\033\$[\@AB]|\033\$\([CD])//) { # Double Byte Segment | |
177 | $inascii = 0; | |
178 | $ret .= $1; | |
179 | while (1) { | |
180 | if (s/^([!-~][!-~])//) { # Double Char plain text | |
181 | $ret .= $1; | |
182 | # The length of a double-byte-char is assumed 2. | |
183 | # If we consider compatibility with UTF-8, it should be 1. | |
184 | $length -= 2; | |
185 | } elsif (s/(\033\.[A-F])//) { # G2 Designate Sequence | |
186 | $ret .= $1; | |
187 | } elsif (s/(\033N[ -\7f])//) { # Single Shift Sequence | |
188 | $ret .= $1; | |
189 | $length--; | |
190 | } else { | |
191 | last; | |
192 | } | |
193 | last CLIP if ($length <= 0); | |
194 | } | |
195 | } else { | |
196 | # Something wrong in text | |
197 | $ret .= $_; | |
198 | last; | |
199 | } | |
200 | } | |
201 | ||
202 | # remove a `trick' | |
203 | $ret =~ s/^\033\(B//; | |
204 | ||
205 | # Shuold we check the last \033\([BJ] sequence? | |
206 | # (I believe it is too paranoid). | |
207 | $ret .= "\033(B" unless $inascii; | |
208 | ||
209 | ($ret); | |
210 | } | |
211 | ##---------------------------------------------------------------------------## | |
212 | 1; |