| 1 | ##---------------------------------------------------------------------------## |
| 2 | ## File: |
| 3 | ## $Id: mhtxtplain.pl,v 2.24 2002/10/10 22:27:19 ehood Exp $ |
| 4 | ## Author: |
| 5 | ## Earl Hood mhonarc@mhonarc.org |
| 6 | ## Description: |
| 7 | ## Library defines routine to filter text/plain body parts to HTML |
| 8 | ## for MHonArc. |
| 9 | ## Filter routine can be registered with the following: |
| 10 | ## <MIMEFILTERS> |
| 11 | ## text/plain:m2h_text_plain'filter:mhtxtplain.pl |
| 12 | ## </MIMEFILTERS> |
| 13 | ##---------------------------------------------------------------------------## |
| 14 | ## MHonArc -- Internet mail-to-HTML converter |
| 15 | ## Copyright (C) 1995-2001 Earl Hood, mhonarc@mhonarc.org |
| 16 | ## |
| 17 | ## This program is free software; you can redistribute it and/or modify |
| 18 | ## it under the terms of the GNU General Public License as published by |
| 19 | ## the Free Software Foundation; either version 2 of the License, or |
| 20 | ## (at your option) any later version. |
| 21 | ## |
| 22 | ## This program is distributed in the hope that it will be useful, |
| 23 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 24 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 25 | ## GNU General Public License for more details. |
| 26 | ## |
| 27 | ## You should have received a copy of the GNU General Public License |
| 28 | ## along with this program; if not, write to the Free Software |
| 29 | ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
| 30 | ## 02111-1307, USA |
| 31 | ##---------------------------------------------------------------------------## |
| 32 | |
| 33 | package m2h_text_plain; |
| 34 | |
| 35 | require 'readmail.pl'; |
| 36 | |
| 37 | $Url = '(http://|https://|ftp://|afs://|wais://|telnet://|ldap://' . |
| 38 | '|gopher://|news:|nntp:|mid:|cid:|mailto:|prospero:)'; |
| 39 | $UrlExp = $Url . q/[^\s\(\)\|<>"']*[^\.?!;,"'\|\[\]\(\)\s<>]/; |
| 40 | $HUrlExp = $Url . q/(?:&(?![gl]t;)|[^\s\(\)\|<>"'\&])+/ . |
| 41 | q/[^\.?!;,"'\|\[\]\(\)\s<>\&]/; |
| 42 | $QuoteChars = '[>\|\]+:]'; |
| 43 | $HQuoteChars = '>|[\|\]+:]'; |
| 44 | |
| 45 | $StartFlowedQuote = |
| 46 | '<blockquote style="border-left: #0000FF solid 0.1em; '. |
| 47 | 'margin-left: 0.0em; padding-left: 1.0em">'; |
| 48 | $EndFlowedQuote = "</blockquote>"; |
| 49 | |
| 50 | ##---------------------------------------------------------------------------## |
| 51 | ## Text/plain filter for mhonarc. The following filter arguments |
| 52 | ## are recognized ($args): |
| 53 | ## |
| 54 | ## asis=set1:set2:... |
| 55 | ## Colon separated lists of charsets to leave as-is. |
| 56 | ## Only HTML special characters will be converted into |
| 57 | ## entities. The default value is "us-ascii:iso-8859-1". |
| 58 | ## |
| 59 | ## attachcheck Honor attachment disposition. By default, |
| 60 | ## all text/plain data is displayed inline on |
| 61 | ## the message page. If attachcheck is specified |
| 62 | ## and Content-Disposition specifies the data as |
| 63 | ## an attachment, the data is saved to a file |
| 64 | ## with a link to it from the message page. |
| 65 | ## |
| 66 | ## default=set Default charset to use if not set. |
| 67 | ## |
| 68 | ## inlineexts="ext1,ext2,..." |
| 69 | ## A comma separated list of message specified filename |
| 70 | ## extensions to treat as inline data. |
| 71 | ## Applicable only when uudecode options specified. |
| 72 | ## |
| 73 | ## htmlcheck Check if message is actually an HTML message |
| 74 | ## (to get around abhorrent MUAs). The message |
| 75 | ## is treated as HTML if the first non-whitespace |
| 76 | ## data looks like the start of an HTML document. |
| 77 | ## |
| 78 | ## keepspace Preserve whitespace if nonfixed |
| 79 | ## |
| 80 | ## nourl Do hyperlink URLs |
| 81 | ## |
| 82 | ## nonfixed Use normal typeface |
| 83 | ## |
| 84 | ## maxwidth=# Set the maximum width of lines. Lines exceeding |
| 85 | ## the maxwidth will be broken up across multiple lines. |
| 86 | ## |
| 87 | ## quote Italicize quoted message text |
| 88 | ## |
| 89 | ## target=name Set TARGET attribute for links if converting URLs |
| 90 | ## to links. Defaults to _top. |
| 91 | ## |
| 92 | ## usename Use filename specified in uuencoded data when |
| 93 | ## converting uuencoded data. This option is only |
| 94 | ## applicable of uudecode is specified. |
| 95 | ## |
| 96 | ## uudecode Decoded any embedded uuencoded data. |
| 97 | ## |
| 98 | ## All arguments should be separated by at least one space |
| 99 | ## |
| 100 | sub filter { |
| 101 | my($fields, $data, $isdecode, $args) = @_; |
| 102 | local($_); |
| 103 | |
| 104 | ## Parse arguments |
| 105 | $args = "" unless defined($args); |
| 106 | |
| 107 | ## Check if content-disposition should be checked |
| 108 | if ($args =~ /\battachcheck\b/i) { |
| 109 | my($disp, $nameparm) = readmail::MAILhead_get_disposition($fields); |
| 110 | if ($disp =~ /\battachment\b/i) { |
| 111 | require 'mhexternal.pl'; |
| 112 | return (m2h_external::filter( |
| 113 | $fields, $data, $isdecode, |
| 114 | readmail::get_filter_args('m2h_external::filter'))); |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | ## Check if decoding uuencoded data. The implementation chosen here |
| 119 | ## for decoding uuencoded data was done so when uudecode is not |
| 120 | ## specified, there is no extra overhead (besides the $args check for |
| 121 | ## uudecode). However, when uudecode is specified, more overhead may |
| 122 | ## exist over other potential implementations. |
| 123 | ## I.e. We only try to penalize performance when uudecode is specified. |
| 124 | if ($args =~ s/\buudecode\b//ig) { |
| 125 | # $args has uudecode stripped out for recursive calls |
| 126 | |
| 127 | # Make sure we have needed routines |
| 128 | my $decoder = readmail::load_decoder("uuencode"); |
| 129 | if (!defined($decoder) || !defined(&$decoder)) { |
| 130 | require 'base64.pl'; |
| 131 | $decoder = \&base64::uudecode; |
| 132 | } |
| 133 | require 'mhmimetypes.pl'; |
| 134 | |
| 135 | # Grab any filename extensions that imply inlining |
| 136 | my $inlineexts = ''; |
| 137 | if ($args =~ /\binlineexts=(\S+)/) { |
| 138 | $inlineexts = ',' . lc($1) . ','; |
| 139 | $inlineexts =~ s/['"]//g; |
| 140 | } |
| 141 | my $usename = $args =~ /\busename\b/; |
| 142 | |
| 143 | my($pdata); # have to use local() since typeglobs used |
| 144 | my($inext, $uddata, $file, $urlfile); |
| 145 | my @files = ( ); |
| 146 | my $ret = ""; |
| 147 | my $i = 0; |
| 148 | |
| 149 | # <CR><LF> => <LF> to make parsing easier |
| 150 | $$data =~ s/\r\n/\n/g; |
| 151 | |
| 152 | # Split on uuencoded data. For text portions, recursively call |
| 153 | # filter to convert text data: makes it easier to handle all |
| 154 | # the various formatting options. |
| 155 | foreach $pdata |
| 156 | (split(/^(begin\s+\d\d\d\s+[^\n]+\n[!-M].*?\nend\n)/sm, |
| 157 | $$data)) { |
| 158 | if ($i % 2) { # uuencoded data |
| 159 | # extract filename extension |
| 160 | ($file) = $pdata =~ /^begin\s+\d\d\d\s+([^\n]+)/; |
| 161 | if ($file =~ /\.(\w+)$/) { $inext = $1; } else { $inext = ""; } |
| 162 | |
| 163 | # decode data |
| 164 | $uddata = &$decoder($pdata); |
| 165 | |
| 166 | # save to file |
| 167 | if (readmail::MAILis_excluded('application/octet-stream')) { |
| 168 | $ret .= &$readmail::ExcludedPartFunc($file); |
| 169 | } else { |
| 170 | push(@files, |
| 171 | mhonarc::write_attachment( |
| 172 | 'application/octet-stream', \$uddata, '', |
| 173 | ($usename?$file:''), $inext)); |
| 174 | $urlfile = mhonarc::htmlize($files[$#files]); |
| 175 | |
| 176 | # create link to file |
| 177 | if (index($inlineexts, ','.lc($inext).',') >= $[) { |
| 178 | $ret .= qq|<a href="$urlfile"><img src="$urlfile">| . |
| 179 | qq|</a><br>\n|; |
| 180 | } else { |
| 181 | $ret .= qq|<a href="$urlfile">| . |
| 182 | mhonarc::htmlize($file) . qq|</a><br>\n|; |
| 183 | } |
| 184 | } |
| 185 | |
| 186 | } elsif ($pdata =~ /\S/) { # plain text |
| 187 | my(@subret) = filter($fields, \$pdata, $isdecode, $args); |
| 188 | $ret .= shift @subret; |
| 189 | push(@files, @subret); |
| 190 | } else { |
| 191 | # Make sure readmail thinks we processed |
| 192 | $ret .= " "; |
| 193 | } |
| 194 | ++$i; |
| 195 | } |
| 196 | |
| 197 | ## Done with uudecode |
| 198 | return ($ret, @files); |
| 199 | } |
| 200 | |
| 201 | |
| 202 | ## Check for HTML data if requested |
| 203 | if ($args =~ s/\bhtmlcheck\b//i && |
| 204 | $$data =~ /\A\s*<(?:html\b|x-html\b|!doctype\s+html\s)/i) { |
| 205 | if (readmail::MAILis_excluded('text/html')) { |
| 206 | return (&$readmail::ExcludedPartFunc('text/plain HTML')); |
| 207 | } |
| 208 | my $html_filter = readmail::load_filter('text/html'); |
| 209 | if (defined($html_filter) && defined(&$html_filter)) { |
| 210 | return (&$html_filter($fields, $data, $isdecode, |
| 211 | readmail::get_filter_args( |
| 212 | 'text/html', 'text/*', $html_filter))); |
| 213 | } else { |
| 214 | require 'mhtxthtml.pl'; |
| 215 | return (m2h_text_html::filter($fields, $data, $isdecode, |
| 216 | readmail::get_filter_args( |
| 217 | 'text/html', 'text/*', 'm2h_text_html::filter'))); |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | my($charset, $nourl, $doquote, $igncharset, $nonfixed, $textformat, |
| 222 | $keepspace, $maxwidth, $target, $defset, $xhtml); |
| 223 | my(%asis) = ( ); |
| 224 | |
| 225 | $nourl = ($mhonarc::NOURL || ($args =~ /\bnourl\b/i)); |
| 226 | $doquote = ($args =~ /\bquote\b/i); |
| 227 | $nonfixed = ($args =~ /\bnonfixed\b/i); |
| 228 | $keepspace = ($args =~ /\bkeepspace\b/i); |
| 229 | if ($args =~ /\bmaxwidth=(\d+)/i) { $maxwidth = $1; } |
| 230 | else { $maxwidth = 0; } |
| 231 | if ($args =~ /\bdefault=(\S+)/i) { $defset = lc $1; } |
| 232 | else { $defset = 'us-ascii'; } |
| 233 | $target = ""; |
| 234 | if ($args =~ /\btarget="([^"]+)"/i) { $target = $1; } |
| 235 | elsif ($args =~ /\btarget=(\S+)/i) { $target = $1; } |
| 236 | $target =~ s/['"]//g; |
| 237 | if ($target) { |
| 238 | $target = qq/target="$target"/; |
| 239 | } |
| 240 | $defset =~ s/['"\s]//g; |
| 241 | |
| 242 | ## Grab charset parameter (if defined) |
| 243 | if ( defined($fields->{'content-type'}[0]) and |
| 244 | $fields->{'content-type'}[0] =~ /\bcharset\s*=\s*([^\s;]+)/i ) { |
| 245 | $charset = lc $1; |
| 246 | $charset =~ s/['";\s]//g; |
| 247 | } else { |
| 248 | $charset = $defset; |
| 249 | } |
| 250 | ## Grab format parameter (if defined) |
| 251 | if ( defined($fields->{'content-type'}[0]) and |
| 252 | $fields->{'content-type'}[0] =~ /\bformat\s*=\s*([^\s;]+)/i ) { |
| 253 | $textformat = lc $1; |
| 254 | $textformat =~ s/['";\s]//g; |
| 255 | } else { |
| 256 | $textformat = "fixed"; |
| 257 | } |
| 258 | |
| 259 | ## Check if certain charsets should be left alone |
| 260 | if ($args =~ /\basis=(\S+)/i) { |
| 261 | my $t = lc $1; $t =~ s/['"]//g; |
| 262 | local($_); foreach (split(':', $t)) { $asis{$_} = 1; } |
| 263 | } |
| 264 | |
| 265 | ## Check MIMECharSetConverters if charset should be left alone |
| 266 | my($charcnv, $real_charset_name) = |
| 267 | readmail::MAILload_charset_converter($charset); |
| 268 | if (defined($charcnv) && $charcnv eq '-decode-') { |
| 269 | $asis{$charset} = 1; |
| 270 | } |
| 271 | |
| 272 | ## Check if max-width set |
| 273 | if ($maxwidth && $textformat eq 'fixed') { |
| 274 | $$data =~ s/^(.*)$/&break_line($1, $maxwidth)/gem; |
| 275 | } |
| 276 | |
| 277 | ## Convert data according to charset |
| 278 | if (!$asis{$charset}) { |
| 279 | # Registered in CHARSETCONVERTERS |
| 280 | if (defined($charcnv) && defined(&$charcnv)) { |
| 281 | $$data = &$charcnv($$data, $real_charset_name); |
| 282 | |
| 283 | # Other |
| 284 | } else { |
| 285 | warn qq/\n/, |
| 286 | qq/Warning: Unrecognized character set: $charset\n/, |
| 287 | qq/ Message-Id: <$mhonarc::MHAmsgid>\n/, |
| 288 | qq/ Message Number: $mhonarc::MHAmsgnum\n/; |
| 289 | esc_chars_inplace($data); |
| 290 | } |
| 291 | |
| 292 | } else { |
| 293 | esc_chars_inplace($data); |
| 294 | } |
| 295 | |
| 296 | if ($textformat eq 'flowed') { |
| 297 | # Initial code for format=flowed contributed by Ken Hirsch (May 2002). |
| 298 | # text/plain; format=flowed defined in RFC2646 |
| 299 | |
| 300 | my $currdepth = 0; |
| 301 | my $ret=''; |
| 302 | $$data =~ s!^</?x-flowed>\r?\n>!!mg; |
| 303 | while (length($$data)) { |
| 304 | $$data =~ /^((?:>)*)/; |
| 305 | my $qd = $1; |
| 306 | if ($$data =~ s/^(.*(?:(?:\n|\r\n?)$qd(?!>).*)*\n?)//) { |
| 307 | # divide message into chunks by "quote-depth", |
| 308 | # which is the number of leading > signs |
| 309 | my $chunk = $1; |
| 310 | $chunk =~ s/^$qd ?//mg; # N.B. also takes care of |
| 311 | # space-stuffing |
| 312 | $chunk =~ s/^-- $/--/mg; # special case for '-- ' |
| 313 | |
| 314 | if ($chunk =~ / \r?\n/) { |
| 315 | # Treat this chunk as format=flowed |
| 316 | # Lines that end with spaces are |
| 317 | # considered to have soft line breaks. |
| 318 | # Lines that end with no spaces are |
| 319 | # considered to have hard line breaks. |
| 320 | # XXX: Negative look-behind assertion not supported |
| 321 | # on older versions of Perl 5 (<5.6) |
| 322 | #$chunk =~ s/(?<! )(\r?\n|\Z)/<br>$1/g; |
| 323 | $chunk =~ s/(^|[^ ])(\r?\n|\Z)/$1<br>$2/mg; |
| 324 | |
| 325 | } else { |
| 326 | # Treat this chunk as format=fixed |
| 327 | if ($nonfixed) { |
| 328 | $chunk =~ s/(\r?\n)/<br>$1/g; |
| 329 | if ($keepspace) { |
| 330 | $chunk =~ s/^(.*)$/&preserve_space($1)/gem; |
| 331 | } |
| 332 | } else { |
| 333 | $chunk = "<pre>" . $chunk . "</pre>\n"; |
| 334 | } |
| 335 | } |
| 336 | my $newdepth = length($qd)/length('>'); |
| 337 | if ($currdepth < $newdepth) { |
| 338 | $chunk = $StartFlowedQuote x |
| 339 | ($newdepth - $currdepth) . $chunk; |
| 340 | } elsif ($currdepth > $newdepth) { |
| 341 | $chunk = $EndFlowedQuote x |
| 342 | ($currdepth - $newdepth) . $chunk; |
| 343 | } |
| 344 | $currdepth = $newdepth; |
| 345 | $ret .= $chunk; |
| 346 | |
| 347 | } else { |
| 348 | # The above regex should always match, but just in case... |
| 349 | warn qq/\n/, |
| 350 | qq/Warning: Dequoting problem with format=flowed data\n/, |
| 351 | qq/ Message-Id: <$MHAmsgid>\n/, |
| 352 | qq/ Message Number: $MHAmsgnum\n/; |
| 353 | $ret .= $$data; |
| 354 | last; |
| 355 | } |
| 356 | } |
| 357 | if ($currdepth > 0) { |
| 358 | $ret .= $EndFlowedQuote x $currdepth; |
| 359 | } |
| 360 | |
| 361 | ## Post-processing cleanup: makes things look nicer |
| 362 | $ret =~ s/<br><\/blockquote>/<\/blockquote>/g; |
| 363 | $ret =~ s/<\/blockquote><br>/<\/blockquote>/g; |
| 364 | |
| 365 | $$data = $ret; |
| 366 | |
| 367 | } else { |
| 368 | ## Check for quoting |
| 369 | if ($doquote) { |
| 370 | $$data =~ s@^( ?${HQuoteChars})(.*)$@$1<i>$2</i>@gom; |
| 371 | } |
| 372 | |
| 373 | ## Check if using nonfixed font |
| 374 | if ($nonfixed) { |
| 375 | $$data =~ s/(\r?\n)/<br>$1/g; |
| 376 | if ($keepspace) { |
| 377 | $$data =~ s/^(.*)$/&preserve_space($1)/gem; |
| 378 | } |
| 379 | } else { |
| 380 | $$data = "<pre>" . $$data . "</pre>\n"; |
| 381 | } |
| 382 | } |
| 383 | |
| 384 | ## Convert URLs to hyperlinks |
| 385 | $$data =~ s@($HUrlExp)@<a $target href="$1">$1</a>@gio |
| 386 | unless $nourl; |
| 387 | |
| 388 | ($$data); |
| 389 | } |
| 390 | |
| 391 | ##---------------------------------------------------------------------------## |
| 392 | |
| 393 | sub esc_chars_inplace { |
| 394 | my($foo) = shift; |
| 395 | $$foo =~ s/&/&/g; |
| 396 | $$foo =~ s/</</g; |
| 397 | $$foo =~ s/>/>/g; |
| 398 | $$foo =~ s/"/"/g; |
| 399 | 1; |
| 400 | } |
| 401 | |
| 402 | ##---------------------------------------------------------------------------## |
| 403 | |
| 404 | sub preserve_space { |
| 405 | my($str) = shift; |
| 406 | |
| 407 | 1 while |
| 408 | $str =~ s/^([^\t]*)(\t+)/$1 . ' ' x (length($2) * 8 - length($1) % 8)/e; |
| 409 | $str =~ s/ /\ /g; |
| 410 | $str; |
| 411 | } |
| 412 | |
| 413 | ##---------------------------------------------------------------------------## |
| 414 | |
| 415 | sub break_line { |
| 416 | my($str) = shift; |
| 417 | my($width) = shift; |
| 418 | my($q, $new) = ('', ''); |
| 419 | my($try, $trywidth, $len); |
| 420 | |
| 421 | ## Translate tabs to spaces |
| 422 | 1 while |
| 423 | $str =~ s/^([^\t]*)(\t+)/$1 . ' ' x (length($2) * 8 - length($1) % 8)/e; |
| 424 | |
| 425 | ## Do nothing if str <= width |
| 426 | return $str if length($str) <= $width; |
| 427 | |
| 428 | ## See if str begins with a quote char |
| 429 | if ($str =~ s/^( ?$QuoteChars)//o) { |
| 430 | $q = $1; |
| 431 | --$width; |
| 432 | } |
| 433 | |
| 434 | ## Create new string by breaking up str |
| 435 | while ($str ne "") { |
| 436 | |
| 437 | # If $str less than width, break out |
| 438 | if (length($str) <= $width) { |
| 439 | $new .= $q . $str; |
| 440 | last; |
| 441 | } |
| 442 | |
| 443 | # handle case where no-whitespace line larger than width |
| 444 | if (($str =~ /^(\S+)/) && (($len = length($1)) >= $width)) { |
| 445 | $new .= $q . $1; |
| 446 | substr($str, 0, $len) = ""; |
| 447 | next; |
| 448 | } |
| 449 | |
| 450 | # Break string at whitespace |
| 451 | $try = ''; |
| 452 | $trywidth = $width; |
| 453 | $try = substr($str, 0, $trywidth); |
| 454 | if ($try =~ /(\S+)$/) { |
| 455 | $trywidth -= length($1); |
| 456 | $new .= $q . substr($str, 0, $trywidth); |
| 457 | } else { |
| 458 | $new .= $q . $try; |
| 459 | } |
| 460 | substr($str, 0, $trywidth) = ''; |
| 461 | |
| 462 | } continue { |
| 463 | $new .= "\n" if $str; |
| 464 | } |
| 465 | $new; |
| 466 | } |
| 467 | |
| 468 | ##---------------------------------------------------------------------------## |
| 469 | 1; |