Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package Encode::CN::HZ; |
2 | ||
3 | use strict; | |
4 | ||
5 | use vars qw($VERSION); | |
6 | $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
7 | ||
8 | use Encode qw(:fallbacks); | |
9 | ||
10 | use base qw(Encode::Encoding); | |
11 | __PACKAGE__->Define('hz'); | |
12 | ||
13 | # HZ is a combination of ASCII and escaped GB, so we implement it | |
14 | # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. | |
15 | ||
16 | # not ported for EBCDIC. Which should be used, "~" or "\x7E"? | |
17 | ||
18 | sub needs_lines { 1 } | |
19 | ||
20 | sub decode ($$;$) | |
21 | { | |
22 | my ($obj,$str,$chk) = @_; | |
23 | ||
24 | my $GB = Encode::find_encoding('gb2312-raw'); | |
25 | my $ret = ''; | |
26 | my $in_ascii = 1; # default mode is ASCII. | |
27 | ||
28 | while (length $str) { | |
29 | if ($in_ascii) { # ASCII mode | |
30 | if ($str =~ s/^([\x00-\x7D\x7F]+)//) { # no '~' => ASCII | |
31 | $ret .= $1; | |
32 | # EBCDIC should need ascii2native, but not ported. | |
33 | } | |
34 | elsif ($str =~ s/^\x7E\x7E//) { # escaped tilde | |
35 | $ret .= '~'; | |
36 | } | |
37 | elsif ($str =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII | |
38 | 1; # no-op | |
39 | } | |
40 | elsif ($str =~ s/^\x7E\x7B//) { # '~{' | |
41 | $in_ascii = 0; # to GB | |
42 | } | |
43 | else { # encounters an invalid escape, \x80 or greater | |
44 | last; | |
45 | } | |
46 | } | |
47 | else { # GB mode; the byte ranges are as in RFC 1843. | |
48 | if ($str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) { | |
49 | $ret .= $GB->decode($1, $chk); | |
50 | } | |
51 | elsif ($str =~ s/^\x7E\x7D//) { # '~}' | |
52 | $in_ascii = 1; | |
53 | } | |
54 | else { # invalid | |
55 | last; | |
56 | } | |
57 | } | |
58 | } | |
59 | $_[1] = '' if $chk; # needs_lines guarantees no partial character | |
60 | return $ret; | |
61 | } | |
62 | ||
63 | sub cat_decode { | |
64 | my ($obj, undef, $src, $pos, $trm, $chk) = @_; | |
65 | my ($rdst, $rsrc, $rpos) = \@_[1..3]; | |
66 | ||
67 | my $GB = Encode::find_encoding('gb2312-raw'); | |
68 | my $ret = ''; | |
69 | my $in_ascii = 1; # default mode is ASCII. | |
70 | ||
71 | my $ini_pos = pos($$rsrc); | |
72 | ||
73 | substr($src, 0, $pos) = ''; | |
74 | ||
75 | my $ini_len = bytes::length($src); | |
76 | ||
77 | # $trm is the first of the pair '~~', then 2nd tilde is to be removed. | |
78 | # XXX: Is better C<$src =~ s/^\x7E// or die if ...>? | |
79 | $src =~ s/^\x7E// if $trm eq "\x7E"; | |
80 | ||
81 | while (length $src) { | |
82 | my $now; | |
83 | if ($in_ascii) { # ASCII mode | |
84 | if ($src =~ s/^([\x00-\x7D\x7F])//) { # no '~' => ASCII | |
85 | $now = $1; | |
86 | } | |
87 | elsif ($src =~ s/^\x7E\x7E//) { # escaped tilde | |
88 | $now = '~'; | |
89 | } | |
90 | elsif ($src =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII | |
91 | next; | |
92 | } | |
93 | elsif ($src =~ s/^\x7E\x7B//) { # '~{' | |
94 | $in_ascii = 0; # to GB | |
95 | next; | |
96 | } | |
97 | else { # encounters an invalid escape, \x80 or greater | |
98 | last; | |
99 | } | |
100 | } | |
101 | else { # GB mode; the byte ranges are as in RFC 1843. | |
102 | if ($src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)//) { | |
103 | $now = $GB->decode($1, $chk); | |
104 | } | |
105 | elsif ($src =~ s/^\x7E\x7D//) { # '~}' | |
106 | $in_ascii = 1; | |
107 | next; | |
108 | } | |
109 | else { # invalid | |
110 | last; | |
111 | } | |
112 | } | |
113 | ||
114 | next if ! defined $now; | |
115 | ||
116 | $ret .= $now; | |
117 | ||
118 | if ($now eq $trm) { | |
119 | $$rdst .= $ret; | |
120 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); | |
121 | pos($$rsrc) = $ini_pos; | |
122 | return 1; | |
123 | } | |
124 | } | |
125 | ||
126 | $$rdst .= $ret; | |
127 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); | |
128 | pos($$rsrc) = $ini_pos; | |
129 | return ''; # terminator not found | |
130 | } | |
131 | ||
132 | ||
133 | sub encode($$;$) | |
134 | { | |
135 | my ($obj,$str,$chk) = @_; | |
136 | ||
137 | my $GB = Encode::find_encoding('gb2312-raw'); | |
138 | my $ret = ''; | |
139 | my $in_ascii = 1; # default mode is ASCII. | |
140 | ||
141 | no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. | |
142 | ||
143 | while (length $str) { | |
144 | if ($str =~ s/^([[:ascii:]]+)//) { | |
145 | my $tmp = $1; | |
146 | $tmp =~ s/~/~~/g; # escapes tildes | |
147 | if (! $in_ascii) { | |
148 | $ret .= "\x7E\x7D"; # '~}' | |
149 | $in_ascii = 1; | |
150 | } | |
151 | $ret .= pack 'a*', $tmp; # remove UTF8 flag. | |
152 | } | |
153 | elsif ($str =~ s/(.)//) { | |
154 | my $s = $1; | |
155 | my $tmp = $GB->encode($s, $chk); | |
156 | last if !defined $tmp; | |
157 | if (length $tmp == 2) { # maybe a valid GB char (XXX) | |
158 | if ($in_ascii) { | |
159 | $ret .= "\x7E\x7B"; # '~{' | |
160 | $in_ascii = 0; | |
161 | } | |
162 | $ret .= $tmp; | |
163 | } | |
164 | elsif (length $tmp) { # maybe FALLBACK in ASCII (XXX) | |
165 | if (!$in_ascii) { | |
166 | $ret .= "\x7E\x7D"; # '~}' | |
167 | $in_ascii = 1; | |
168 | } | |
169 | $ret .= $tmp; | |
170 | } | |
171 | } | |
172 | else { # if $str is malformed UTF8 *and* if length $str != 0. | |
173 | last; | |
174 | } | |
175 | } | |
176 | $_[1] = $str if $chk; | |
177 | ||
178 | # The state at the end of the chunk is discarded, even if in GB mode. | |
179 | # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". | |
180 | # Parhaps it is harmless, but further investigations may be required... | |
181 | ||
182 | if (! $in_ascii) { | |
183 | $ret .= "\x7E\x7D"; # '~}' | |
184 | $in_ascii = 1; | |
185 | } | |
186 | return $ret; | |
187 | } | |
188 | ||
189 | 1; | |
190 | __END__ | |
191 | ||
192 | =head1 NAME | |
193 | ||
194 | Encode::CN::HZ -- internally used by Encode::CN | |
195 | ||
196 | =cut |