| 1 | package Encode::MIME::Header; |
| 2 | use strict; |
| 3 | # use warnings; |
| 4 | our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
| 5 | use Encode qw(find_encoding encode_utf8 decode_utf8); |
| 6 | use MIME::Base64; |
| 7 | use Carp; |
| 8 | |
| 9 | my %seed = |
| 10 | ( |
| 11 | decode_b => '1', # decodes 'B' encoding ? |
| 12 | decode_q => '1', # decodes 'Q' encoding ? |
| 13 | encode => 'B', # encode with 'B' or 'Q' ? |
| 14 | bpl => 75, # bytes per line |
| 15 | ); |
| 16 | |
| 17 | $Encode::Encoding{'MIME-Header'} = |
| 18 | bless { |
| 19 | %seed, |
| 20 | Name => 'MIME-Header', |
| 21 | } => __PACKAGE__; |
| 22 | |
| 23 | $Encode::Encoding{'MIME-B'} = |
| 24 | bless { |
| 25 | %seed, |
| 26 | decode_q => 0, |
| 27 | Name => 'MIME-B', |
| 28 | } => __PACKAGE__; |
| 29 | |
| 30 | $Encode::Encoding{'MIME-Q'} = |
| 31 | bless { |
| 32 | %seed, |
| 33 | decode_q => 1, |
| 34 | encode => 'Q', |
| 35 | Name => 'MIME-Q', |
| 36 | } => __PACKAGE__; |
| 37 | |
| 38 | use base qw(Encode::Encoding); |
| 39 | |
| 40 | sub needs_lines { 1 } |
| 41 | sub perlio_ok{ 0 }; |
| 42 | |
| 43 | sub decode($$;$){ |
| 44 | use utf8; |
| 45 | my ($obj, $str, $chk) = @_; |
| 46 | # zap spaces between encoded words |
| 47 | $str =~ s/\?=\s+=\?/\?==\?/gos; |
| 48 | # multi-line header to single line |
| 49 | $str =~ s/(:?\r|\n|\r\n)[ \t]//gos; |
| 50 | $str =~ |
| 51 | s{ |
| 52 | =\? # begin encoded word |
| 53 | ([0-9A-Za-z\-_]+) # charset (encoding) |
| 54 | \?([QqBb])\? # delimiter |
| 55 | (.*?) # Base64-encodede contents |
| 56 | \?= # end encoded word |
| 57 | }{ |
| 58 | if (uc($2) eq 'B'){ |
| 59 | $obj->{decode_b} or croak qq(MIME "B" unsupported); |
| 60 | decode_b($1, $3); |
| 61 | }elsif(uc($2) eq 'Q'){ |
| 62 | $obj->{decode_q} or croak qq(MIME "Q" unsupported); |
| 63 | decode_q($1, $3); |
| 64 | }else{ |
| 65 | croak qq(MIME "$2" encoding is nonexistent!); |
| 66 | } |
| 67 | }egox; |
| 68 | $_[1] = '' if $chk; |
| 69 | return $str; |
| 70 | } |
| 71 | |
| 72 | sub decode_b{ |
| 73 | my $enc = shift; |
| 74 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); |
| 75 | my $db64 = decode_base64(shift); |
| 76 | return $d->name eq 'utf8' ? |
| 77 | Encode::decode_utf8($db64) : $d->decode($db64, Encode::FB_PERLQQ); |
| 78 | } |
| 79 | |
| 80 | sub decode_q{ |
| 81 | my ($enc, $q) = @_; |
| 82 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); |
| 83 | $q =~ s/_/ /go; |
| 84 | $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; |
| 85 | return $d->name eq 'utf8' ? |
| 86 | Encode::decode_utf8($q) : $d->decode($q, Encode::FB_PERLQQ); |
| 87 | } |
| 88 | |
| 89 | my $especials = |
| 90 | join('|' => |
| 91 | map {quotemeta(chr($_))} |
| 92 | unpack("C*", qq{()<>@,;:\"\'/[]?.=})); |
| 93 | |
| 94 | my $re_encoded_word = |
| 95 | qr{ |
| 96 | (?: |
| 97 | =\? # begin encoded word |
| 98 | (?:[0-9A-Za-z\-_]+) # charset (encoding) |
| 99 | \?(?:[QqBb])\? # delimiter |
| 100 | (?:.*?) # Base64-encodede contents |
| 101 | \?= # end encoded word |
| 102 | ) |
| 103 | }xo; |
| 104 | |
| 105 | my $re_especials = qr{$re_encoded_word|$especials}xo; |
| 106 | |
| 107 | sub encode($$;$){ |
| 108 | my ($obj, $str, $chk) = @_; |
| 109 | my @line = (); |
| 110 | for my $line (split /\r|\n|\r\n/o, $str){ |
| 111 | my (@word, @subline); |
| 112 | for my $word (split /($re_especials)/o, $line){ |
| 113 | if ($word =~ /[^\x00-\x7f]/o or $word =~ /^$re_encoded_word$/o){ |
| 114 | push @word, $obj->_encode($word); |
| 115 | }else{ |
| 116 | push @word, $word; |
| 117 | } |
| 118 | } |
| 119 | my $subline = ''; |
| 120 | for my $word (@word){ |
| 121 | use bytes (); |
| 122 | if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){ |
| 123 | push @subline, $subline; |
| 124 | $subline = ''; |
| 125 | } |
| 126 | $subline .= $word; |
| 127 | } |
| 128 | $subline and push @subline, $subline; |
| 129 | push @line, join("\n " => @subline); |
| 130 | } |
| 131 | $_[1] = '' if $chk; |
| 132 | return join("\n", @line); |
| 133 | } |
| 134 | |
| 135 | use constant HEAD => '=?UTF-8?'; |
| 136 | use constant TAIL => '?='; |
| 137 | use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; |
| 138 | |
| 139 | sub _encode{ |
| 140 | my ($o, $str) = @_; |
| 141 | my $enc = $o->{encode}; |
| 142 | my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL)); |
| 143 | # to coerce a floating-point arithmetics, the following contains |
| 144 | # .0 in numbers -- dankogai |
| 145 | $llen *= $enc eq 'B' ? 3.0/4.0 : 1.0/3.0; |
| 146 | my @result = (); |
| 147 | my $chunk = ''; |
| 148 | while(length(my $chr = substr($str, 0, 1, ''))){ |
| 149 | use bytes (); |
| 150 | if (bytes::length($chunk) + bytes::length($chr) > $llen){ |
| 151 | push @result, SINGLE->{$enc}($chunk); |
| 152 | $chunk = ''; |
| 153 | } |
| 154 | $chunk .= $chr; |
| 155 | } |
| 156 | $chunk and push @result, SINGLE->{$enc}($chunk); |
| 157 | return @result; |
| 158 | } |
| 159 | |
| 160 | sub _encode_b{ |
| 161 | HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL; |
| 162 | } |
| 163 | |
| 164 | sub _encode_q{ |
| 165 | my $chunk = shift; |
| 166 | $chunk =~ s{ |
| 167 | ([^0-9A-Za-z]) |
| 168 | }{ |
| 169 | join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) |
| 170 | }egox; |
| 171 | return decode_utf8(HEAD . 'Q?' . $chunk . TAIL); |
| 172 | } |
| 173 | |
| 174 | 1; |
| 175 | __END__ |
| 176 | |
| 177 | =head1 NAME |
| 178 | |
| 179 | Encode::MIME::Header -- MIME 'B' and 'Q' header encoding |
| 180 | |
| 181 | =head1 SYNOPSIS |
| 182 | |
| 183 | use Encode qw/encode decode/; |
| 184 | $utf8 = decode('MIME-Header', $header); |
| 185 | $header = encode('MIME-Header', $utf8); |
| 186 | |
| 187 | =head1 ABSTRACT |
| 188 | |
| 189 | This module implements RFC 2047 Mime Header Encoding. There are 3 |
| 190 | variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The |
| 191 | difference is described below |
| 192 | |
| 193 | decode() encode() |
| 194 | ---------------------------------------------- |
| 195 | MIME-Header Both B and Q =?UTF-8?B?....?= |
| 196 | MIME-B B only; Q croaks =?UTF-8?B?....?= |
| 197 | MIME-Q Q only; B croaks =?UTF-8?Q?....?= |
| 198 | |
| 199 | =head1 DESCRIPTION |
| 200 | |
| 201 | When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> |
| 202 | is extracted and decoded for I<X> encoding (B for Base64, Q for |
| 203 | Quoted-Printable). Then the decoded chunk is fed to |
| 204 | decode(I<encoding>). So long as I<encoding> is supported by Encode, |
| 205 | any source encoding is fine. |
| 206 | |
| 207 | When you encode, it just encodes UTF-8 string with I<X> encoding then |
| 208 | quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to |
| 209 | encode are left as is and long lines are folded within 76 bytes per |
| 210 | line. |
| 211 | |
| 212 | =head1 BUGS |
| 213 | |
| 214 | It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? |
| 215 | and =?ISO-8859-1?= but that makes the implementation too complicated. |
| 216 | These days major mail agents all support =?UTF-8? so I think it is |
| 217 | just good enough. |
| 218 | |
| 219 | Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by |
| 220 | Makamaka. Thre are still too many MUAs especially cellular phone |
| 221 | handsets which does not grok UTF-8. |
| 222 | |
| 223 | =head1 SEE ALSO |
| 224 | |
| 225 | L<Encode> |
| 226 | |
| 227 | RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other |
| 228 | locations. |
| 229 | |
| 230 | =cut |