Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package Encode::MIME::Header; |
2 | use strict; | |
3 | # use warnings; | |
4 | our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
5 | use Encode qw(find_encoding encode_utf8 decode_utf8); | |
6 | use MIME::Base64; | |
7 | use Carp; | |
8 | ||
9 | my %seed = | |
10 | ( | |
11 | decode_b => '1', # decodes 'B' encoding ? | |
12 | decode_q => '1', # decodes 'Q' encoding ? | |
13 | encode => 'B', # encode with 'B' or 'Q' ? | |
14 | bpl => 75, # bytes per line | |
15 | ); | |
16 | ||
17 | $Encode::Encoding{'MIME-Header'} = | |
18 | bless { | |
19 | %seed, | |
20 | Name => 'MIME-Header', | |
21 | } => __PACKAGE__; | |
22 | ||
23 | $Encode::Encoding{'MIME-B'} = | |
24 | bless { | |
25 | %seed, | |
26 | decode_q => 0, | |
27 | Name => 'MIME-B', | |
28 | } => __PACKAGE__; | |
29 | ||
30 | $Encode::Encoding{'MIME-Q'} = | |
31 | bless { | |
32 | %seed, | |
33 | decode_q => 1, | |
34 | encode => 'Q', | |
35 | Name => 'MIME-Q', | |
36 | } => __PACKAGE__; | |
37 | ||
38 | use base qw(Encode::Encoding); | |
39 | ||
40 | sub needs_lines { 1 } | |
41 | sub perlio_ok{ 0 }; | |
42 | ||
43 | sub decode($$;$){ | |
44 | use utf8; | |
45 | my ($obj, $str, $chk) = @_; | |
46 | # zap spaces between encoded words | |
47 | $str =~ s/\?=\s+=\?/\?==\?/gos; | |
48 | # multi-line header to single line | |
49 | $str =~ s/(:?\r|\n|\r\n)[ \t]//gos; | |
50 | $str =~ | |
51 | s{ | |
52 | =\? # begin encoded word | |
53 | ([0-9A-Za-z\-_]+) # charset (encoding) | |
54 | \?([QqBb])\? # delimiter | |
55 | (.*?) # Base64-encodede contents | |
56 | \?= # end encoded word | |
57 | }{ | |
58 | if (uc($2) eq 'B'){ | |
59 | $obj->{decode_b} or croak qq(MIME "B" unsupported); | |
60 | decode_b($1, $3); | |
61 | }elsif(uc($2) eq 'Q'){ | |
62 | $obj->{decode_q} or croak qq(MIME "Q" unsupported); | |
63 | decode_q($1, $3); | |
64 | }else{ | |
65 | croak qq(MIME "$2" encoding is nonexistent!); | |
66 | } | |
67 | }egox; | |
68 | $_[1] = '' if $chk; | |
69 | return $str; | |
70 | } | |
71 | ||
72 | sub decode_b{ | |
73 | my $enc = shift; | |
74 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); | |
75 | my $db64 = decode_base64(shift); | |
76 | return $d->name eq 'utf8' ? | |
77 | Encode::decode_utf8($db64) : $d->decode($db64, Encode::FB_PERLQQ); | |
78 | } | |
79 | ||
80 | sub decode_q{ | |
81 | my ($enc, $q) = @_; | |
82 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); | |
83 | $q =~ s/_/ /go; | |
84 | $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; | |
85 | return $d->name eq 'utf8' ? | |
86 | Encode::decode_utf8($q) : $d->decode($q, Encode::FB_PERLQQ); | |
87 | } | |
88 | ||
89 | my $especials = | |
90 | join('|' => | |
91 | map {quotemeta(chr($_))} | |
92 | unpack("C*", qq{()<>@,;:\"\'/[]?.=})); | |
93 | ||
94 | my $re_encoded_word = | |
95 | qr{ | |
96 | (?: | |
97 | =\? # begin encoded word | |
98 | (?:[0-9A-Za-z\-_]+) # charset (encoding) | |
99 | \?(?:[QqBb])\? # delimiter | |
100 | (?:.*?) # Base64-encodede contents | |
101 | \?= # end encoded word | |
102 | ) | |
103 | }xo; | |
104 | ||
105 | my $re_especials = qr{$re_encoded_word|$especials}xo; | |
106 | ||
107 | sub encode($$;$){ | |
108 | my ($obj, $str, $chk) = @_; | |
109 | my @line = (); | |
110 | for my $line (split /\r|\n|\r\n/o, $str){ | |
111 | my (@word, @subline); | |
112 | for my $word (split /($re_especials)/o, $line){ | |
113 | if ($word =~ /[^\x00-\x7f]/o or $word =~ /^$re_encoded_word$/o){ | |
114 | push @word, $obj->_encode($word); | |
115 | }else{ | |
116 | push @word, $word; | |
117 | } | |
118 | } | |
119 | my $subline = ''; | |
120 | for my $word (@word){ | |
121 | use bytes (); | |
122 | if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){ | |
123 | push @subline, $subline; | |
124 | $subline = ''; | |
125 | } | |
126 | $subline .= $word; | |
127 | } | |
128 | $subline and push @subline, $subline; | |
129 | push @line, join("\n " => @subline); | |
130 | } | |
131 | $_[1] = '' if $chk; | |
132 | return join("\n", @line); | |
133 | } | |
134 | ||
135 | use constant HEAD => '=?UTF-8?'; | |
136 | use constant TAIL => '?='; | |
137 | use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; | |
138 | ||
139 | sub _encode{ | |
140 | my ($o, $str) = @_; | |
141 | my $enc = $o->{encode}; | |
142 | my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL)); | |
143 | # to coerce a floating-point arithmetics, the following contains | |
144 | # .0 in numbers -- dankogai | |
145 | $llen *= $enc eq 'B' ? 3.0/4.0 : 1.0/3.0; | |
146 | my @result = (); | |
147 | my $chunk = ''; | |
148 | while(length(my $chr = substr($str, 0, 1, ''))){ | |
149 | use bytes (); | |
150 | if (bytes::length($chunk) + bytes::length($chr) > $llen){ | |
151 | push @result, SINGLE->{$enc}($chunk); | |
152 | $chunk = ''; | |
153 | } | |
154 | $chunk .= $chr; | |
155 | } | |
156 | $chunk and push @result, SINGLE->{$enc}($chunk); | |
157 | return @result; | |
158 | } | |
159 | ||
160 | sub _encode_b{ | |
161 | HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL; | |
162 | } | |
163 | ||
164 | sub _encode_q{ | |
165 | my $chunk = shift; | |
166 | $chunk =~ s{ | |
167 | ([^0-9A-Za-z]) | |
168 | }{ | |
169 | join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) | |
170 | }egox; | |
171 | return decode_utf8(HEAD . 'Q?' . $chunk . TAIL); | |
172 | } | |
173 | ||
174 | 1; | |
175 | __END__ | |
176 | ||
177 | =head1 NAME | |
178 | ||
179 | Encode::MIME::Header -- MIME 'B' and 'Q' header encoding | |
180 | ||
181 | =head1 SYNOPSIS | |
182 | ||
183 | use Encode qw/encode decode/; | |
184 | $utf8 = decode('MIME-Header', $header); | |
185 | $header = encode('MIME-Header', $utf8); | |
186 | ||
187 | =head1 ABSTRACT | |
188 | ||
189 | This module implements RFC 2047 Mime Header Encoding. There are 3 | |
190 | variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The | |
191 | difference is described below | |
192 | ||
193 | decode() encode() | |
194 | ---------------------------------------------- | |
195 | MIME-Header Both B and Q =?UTF-8?B?....?= | |
196 | MIME-B B only; Q croaks =?UTF-8?B?....?= | |
197 | MIME-Q Q only; B croaks =?UTF-8?Q?....?= | |
198 | ||
199 | =head1 DESCRIPTION | |
200 | ||
201 | When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> | |
202 | is extracted and decoded for I<X> encoding (B for Base64, Q for | |
203 | Quoted-Printable). Then the decoded chunk is fed to | |
204 | decode(I<encoding>). So long as I<encoding> is supported by Encode, | |
205 | any source encoding is fine. | |
206 | ||
207 | When you encode, it just encodes UTF-8 string with I<X> encoding then | |
208 | quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to | |
209 | encode are left as is and long lines are folded within 76 bytes per | |
210 | line. | |
211 | ||
212 | =head1 BUGS | |
213 | ||
214 | It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? | |
215 | and =?ISO-8859-1?= but that makes the implementation too complicated. | |
216 | These days major mail agents all support =?UTF-8? so I think it is | |
217 | just good enough. | |
218 | ||
219 | Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by | |
220 | Makamaka. Thre are still too many MUAs especially cellular phone | |
221 | handsets which does not grok UTF-8. | |
222 | ||
223 | =head1 SEE ALSO | |
224 | ||
225 | L<Encode> | |
226 | ||
227 | RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other | |
228 | locations. | |
229 | ||
230 | =cut |