Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | ##---------------------------------------------------------------------------## |
2 | ## File: | |
3 | ## $Id: mhtxtplain.pl,v 2.24 2002/10/10 22:27:19 ehood Exp $ | |
4 | ## Author: | |
5 | ## Earl Hood mhonarc@mhonarc.org | |
6 | ## Description: | |
7 | ## Library defines routine to filter text/plain body parts to HTML | |
8 | ## for MHonArc. | |
9 | ## Filter routine can be registered with the following: | |
10 | ## <MIMEFILTERS> | |
11 | ## text/plain:m2h_text_plain'filter:mhtxtplain.pl | |
12 | ## </MIMEFILTERS> | |
13 | ##---------------------------------------------------------------------------## | |
14 | ## MHonArc -- Internet mail-to-HTML converter | |
15 | ## Copyright (C) 1995-2001 Earl Hood, mhonarc@mhonarc.org | |
16 | ## | |
17 | ## This program is free software; you can redistribute it and/or modify | |
18 | ## it under the terms of the GNU General Public License as published by | |
19 | ## the Free Software Foundation; either version 2 of the License, or | |
20 | ## (at your option) any later version. | |
21 | ## | |
22 | ## This program is distributed in the hope that it will be useful, | |
23 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
24 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
25 | ## GNU General Public License for more details. | |
26 | ## | |
27 | ## You should have received a copy of the GNU General Public License | |
28 | ## along with this program; if not, write to the Free Software | |
29 | ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA | |
30 | ## 02111-1307, USA | |
31 | ##---------------------------------------------------------------------------## | |
32 | ||
33 | package m2h_text_plain; | |
34 | ||
35 | require 'readmail.pl'; | |
36 | ||
37 | $Url = '(http://|https://|ftp://|afs://|wais://|telnet://|ldap://' . | |
38 | '|gopher://|news:|nntp:|mid:|cid:|mailto:|prospero:)'; | |
39 | $UrlExp = $Url . q/[^\s\(\)\|<>"']*[^\.?!;,"'\|\[\]\(\)\s<>]/; | |
40 | $HUrlExp = $Url . q/(?:&(?![gl]t;)|[^\s\(\)\|<>"'\&])+/ . | |
41 | q/[^\.?!;,"'\|\[\]\(\)\s<>\&]/; | |
42 | $QuoteChars = '[>\|\]+:]'; | |
43 | $HQuoteChars = '>|[\|\]+:]'; | |
44 | ||
45 | $StartFlowedQuote = | |
46 | '<blockquote style="border-left: #0000FF solid 0.1em; '. | |
47 | 'margin-left: 0.0em; padding-left: 1.0em">'; | |
48 | $EndFlowedQuote = "</blockquote>"; | |
49 | ||
50 | ##---------------------------------------------------------------------------## | |
51 | ## Text/plain filter for mhonarc. The following filter arguments | |
52 | ## are recognized ($args): | |
53 | ## | |
54 | ## asis=set1:set2:... | |
55 | ## Colon separated lists of charsets to leave as-is. | |
56 | ## Only HTML special characters will be converted into | |
57 | ## entities. The default value is "us-ascii:iso-8859-1". | |
58 | ## | |
59 | ## attachcheck Honor attachment disposition. By default, | |
60 | ## all text/plain data is displayed inline on | |
61 | ## the message page. If attachcheck is specified | |
62 | ## and Content-Disposition specifies the data as | |
63 | ## an attachment, the data is saved to a file | |
64 | ## with a link to it from the message page. | |
65 | ## | |
66 | ## default=set Default charset to use if not set. | |
67 | ## | |
68 | ## inlineexts="ext1,ext2,..." | |
69 | ## A comma separated list of message specified filename | |
70 | ## extensions to treat as inline data. | |
71 | ## Applicable only when uudecode options specified. | |
72 | ## | |
73 | ## htmlcheck Check if message is actually an HTML message | |
74 | ## (to get around abhorrent MUAs). The message | |
75 | ## is treated as HTML if the first non-whitespace | |
76 | ## data looks like the start of an HTML document. | |
77 | ## | |
78 | ## keepspace Preserve whitespace if nonfixed | |
79 | ## | |
80 | ## nourl Do hyperlink URLs | |
81 | ## | |
82 | ## nonfixed Use normal typeface | |
83 | ## | |
84 | ## maxwidth=# Set the maximum width of lines. Lines exceeding | |
85 | ## the maxwidth will be broken up across multiple lines. | |
86 | ## | |
87 | ## quote Italicize quoted message text | |
88 | ## | |
89 | ## target=name Set TARGET attribute for links if converting URLs | |
90 | ## to links. Defaults to _top. | |
91 | ## | |
92 | ## usename Use filename specified in uuencoded data when | |
93 | ## converting uuencoded data. This option is only | |
94 | ## applicable of uudecode is specified. | |
95 | ## | |
96 | ## uudecode Decoded any embedded uuencoded data. | |
97 | ## | |
98 | ## All arguments should be separated by at least one space | |
99 | ## | |
100 | sub filter { | |
101 | my($fields, $data, $isdecode, $args) = @_; | |
102 | local($_); | |
103 | ||
104 | ## Parse arguments | |
105 | $args = "" unless defined($args); | |
106 | ||
107 | ## Check if content-disposition should be checked | |
108 | if ($args =~ /\battachcheck\b/i) { | |
109 | my($disp, $nameparm) = readmail::MAILhead_get_disposition($fields); | |
110 | if ($disp =~ /\battachment\b/i) { | |
111 | require 'mhexternal.pl'; | |
112 | return (m2h_external::filter( | |
113 | $fields, $data, $isdecode, | |
114 | readmail::get_filter_args('m2h_external::filter'))); | |
115 | } | |
116 | } | |
117 | ||
118 | ## Check if decoding uuencoded data. The implementation chosen here | |
119 | ## for decoding uuencoded data was done so when uudecode is not | |
120 | ## specified, there is no extra overhead (besides the $args check for | |
121 | ## uudecode). However, when uudecode is specified, more overhead may | |
122 | ## exist over other potential implementations. | |
123 | ## I.e. We only try to penalize performance when uudecode is specified. | |
124 | if ($args =~ s/\buudecode\b//ig) { | |
125 | # $args has uudecode stripped out for recursive calls | |
126 | ||
127 | # Make sure we have needed routines | |
128 | my $decoder = readmail::load_decoder("uuencode"); | |
129 | if (!defined($decoder) || !defined(&$decoder)) { | |
130 | require 'base64.pl'; | |
131 | $decoder = \&base64::uudecode; | |
132 | } | |
133 | require 'mhmimetypes.pl'; | |
134 | ||
135 | # Grab any filename extensions that imply inlining | |
136 | my $inlineexts = ''; | |
137 | if ($args =~ /\binlineexts=(\S+)/) { | |
138 | $inlineexts = ',' . lc($1) . ','; | |
139 | $inlineexts =~ s/['"]//g; | |
140 | } | |
141 | my $usename = $args =~ /\busename\b/; | |
142 | ||
143 | my($pdata); # have to use local() since typeglobs used | |
144 | my($inext, $uddata, $file, $urlfile); | |
145 | my @files = ( ); | |
146 | my $ret = ""; | |
147 | my $i = 0; | |
148 | ||
149 | # <CR><LF> => <LF> to make parsing easier | |
150 | $$data =~ s/\r\n/\n/g; | |
151 | ||
152 | # Split on uuencoded data. For text portions, recursively call | |
153 | # filter to convert text data: makes it easier to handle all | |
154 | # the various formatting options. | |
155 | foreach $pdata | |
156 | (split(/^(begin\s+\d\d\d\s+[^\n]+\n[!-M].*?\nend\n)/sm, | |
157 | $$data)) { | |
158 | if ($i % 2) { # uuencoded data | |
159 | # extract filename extension | |
160 | ($file) = $pdata =~ /^begin\s+\d\d\d\s+([^\n]+)/; | |
161 | if ($file =~ /\.(\w+)$/) { $inext = $1; } else { $inext = ""; } | |
162 | ||
163 | # decode data | |
164 | $uddata = &$decoder($pdata); | |
165 | ||
166 | # save to file | |
167 | if (readmail::MAILis_excluded('application/octet-stream')) { | |
168 | $ret .= &$readmail::ExcludedPartFunc($file); | |
169 | } else { | |
170 | push(@files, | |
171 | mhonarc::write_attachment( | |
172 | 'application/octet-stream', \$uddata, '', | |
173 | ($usename?$file:''), $inext)); | |
174 | $urlfile = mhonarc::htmlize($files[$#files]); | |
175 | ||
176 | # create link to file | |
177 | if (index($inlineexts, ','.lc($inext).',') >= $[) { | |
178 | $ret .= qq|<a href="$urlfile"><img src="$urlfile">| . | |
179 | qq|</a><br>\n|; | |
180 | } else { | |
181 | $ret .= qq|<a href="$urlfile">| . | |
182 | mhonarc::htmlize($file) . qq|</a><br>\n|; | |
183 | } | |
184 | } | |
185 | ||
186 | } elsif ($pdata =~ /\S/) { # plain text | |
187 | my(@subret) = filter($fields, \$pdata, $isdecode, $args); | |
188 | $ret .= shift @subret; | |
189 | push(@files, @subret); | |
190 | } else { | |
191 | # Make sure readmail thinks we processed | |
192 | $ret .= " "; | |
193 | } | |
194 | ++$i; | |
195 | } | |
196 | ||
197 | ## Done with uudecode | |
198 | return ($ret, @files); | |
199 | } | |
200 | ||
201 | ||
202 | ## Check for HTML data if requested | |
203 | if ($args =~ s/\bhtmlcheck\b//i && | |
204 | $$data =~ /\A\s*<(?:html\b|x-html\b|!doctype\s+html\s)/i) { | |
205 | if (readmail::MAILis_excluded('text/html')) { | |
206 | return (&$readmail::ExcludedPartFunc('text/plain HTML')); | |
207 | } | |
208 | my $html_filter = readmail::load_filter('text/html'); | |
209 | if (defined($html_filter) && defined(&$html_filter)) { | |
210 | return (&$html_filter($fields, $data, $isdecode, | |
211 | readmail::get_filter_args( | |
212 | 'text/html', 'text/*', $html_filter))); | |
213 | } else { | |
214 | require 'mhtxthtml.pl'; | |
215 | return (m2h_text_html::filter($fields, $data, $isdecode, | |
216 | readmail::get_filter_args( | |
217 | 'text/html', 'text/*', 'm2h_text_html::filter'))); | |
218 | } | |
219 | } | |
220 | ||
221 | my($charset, $nourl, $doquote, $igncharset, $nonfixed, $textformat, | |
222 | $keepspace, $maxwidth, $target, $defset, $xhtml); | |
223 | my(%asis) = ( ); | |
224 | ||
225 | $nourl = ($mhonarc::NOURL || ($args =~ /\bnourl\b/i)); | |
226 | $doquote = ($args =~ /\bquote\b/i); | |
227 | $nonfixed = ($args =~ /\bnonfixed\b/i); | |
228 | $keepspace = ($args =~ /\bkeepspace\b/i); | |
229 | if ($args =~ /\bmaxwidth=(\d+)/i) { $maxwidth = $1; } | |
230 | else { $maxwidth = 0; } | |
231 | if ($args =~ /\bdefault=(\S+)/i) { $defset = lc $1; } | |
232 | else { $defset = 'us-ascii'; } | |
233 | $target = ""; | |
234 | if ($args =~ /\btarget="([^"]+)"/i) { $target = $1; } | |
235 | elsif ($args =~ /\btarget=(\S+)/i) { $target = $1; } | |
236 | $target =~ s/['"]//g; | |
237 | if ($target) { | |
238 | $target = qq/target="$target"/; | |
239 | } | |
240 | $defset =~ s/['"\s]//g; | |
241 | ||
242 | ## Grab charset parameter (if defined) | |
243 | if ( defined($fields->{'content-type'}[0]) and | |
244 | $fields->{'content-type'}[0] =~ /\bcharset\s*=\s*([^\s;]+)/i ) { | |
245 | $charset = lc $1; | |
246 | $charset =~ s/['";\s]//g; | |
247 | } else { | |
248 | $charset = $defset; | |
249 | } | |
250 | ## Grab format parameter (if defined) | |
251 | if ( defined($fields->{'content-type'}[0]) and | |
252 | $fields->{'content-type'}[0] =~ /\bformat\s*=\s*([^\s;]+)/i ) { | |
253 | $textformat = lc $1; | |
254 | $textformat =~ s/['";\s]//g; | |
255 | } else { | |
256 | $textformat = "fixed"; | |
257 | } | |
258 | ||
259 | ## Check if certain charsets should be left alone | |
260 | if ($args =~ /\basis=(\S+)/i) { | |
261 | my $t = lc $1; $t =~ s/['"]//g; | |
262 | local($_); foreach (split(':', $t)) { $asis{$_} = 1; } | |
263 | } | |
264 | ||
265 | ## Check MIMECharSetConverters if charset should be left alone | |
266 | my($charcnv, $real_charset_name) = | |
267 | readmail::MAILload_charset_converter($charset); | |
268 | if (defined($charcnv) && $charcnv eq '-decode-') { | |
269 | $asis{$charset} = 1; | |
270 | } | |
271 | ||
272 | ## Check if max-width set | |
273 | if ($maxwidth && $textformat eq 'fixed') { | |
274 | $$data =~ s/^(.*)$/&break_line($1, $maxwidth)/gem; | |
275 | } | |
276 | ||
277 | ## Convert data according to charset | |
278 | if (!$asis{$charset}) { | |
279 | # Registered in CHARSETCONVERTERS | |
280 | if (defined($charcnv) && defined(&$charcnv)) { | |
281 | $$data = &$charcnv($$data, $real_charset_name); | |
282 | ||
283 | # Other | |
284 | } else { | |
285 | warn qq/\n/, | |
286 | qq/Warning: Unrecognized character set: $charset\n/, | |
287 | qq/ Message-Id: <$mhonarc::MHAmsgid>\n/, | |
288 | qq/ Message Number: $mhonarc::MHAmsgnum\n/; | |
289 | esc_chars_inplace($data); | |
290 | } | |
291 | ||
292 | } else { | |
293 | esc_chars_inplace($data); | |
294 | } | |
295 | ||
296 | if ($textformat eq 'flowed') { | |
297 | # Initial code for format=flowed contributed by Ken Hirsch (May 2002). | |
298 | # text/plain; format=flowed defined in RFC2646 | |
299 | ||
300 | my $currdepth = 0; | |
301 | my $ret=''; | |
302 | $$data =~ s!^</?x-flowed>\r?\n>!!mg; | |
303 | while (length($$data)) { | |
304 | $$data =~ /^((?:>)*)/; | |
305 | my $qd = $1; | |
306 | if ($$data =~ s/^(.*(?:(?:\n|\r\n?)$qd(?!>).*)*\n?)//) { | |
307 | # divide message into chunks by "quote-depth", | |
308 | # which is the number of leading > signs | |
309 | my $chunk = $1; | |
310 | $chunk =~ s/^$qd ?//mg; # N.B. also takes care of | |
311 | # space-stuffing | |
312 | $chunk =~ s/^-- $/--/mg; # special case for '-- ' | |
313 | ||
314 | if ($chunk =~ / \r?\n/) { | |
315 | # Treat this chunk as format=flowed | |
316 | # Lines that end with spaces are | |
317 | # considered to have soft line breaks. | |
318 | # Lines that end with no spaces are | |
319 | # considered to have hard line breaks. | |
320 | # XXX: Negative look-behind assertion not supported | |
321 | # on older versions of Perl 5 (<5.6) | |
322 | #$chunk =~ s/(?<! )(\r?\n|\Z)/<br>$1/g; | |
323 | $chunk =~ s/(^|[^ ])(\r?\n|\Z)/$1<br>$2/mg; | |
324 | ||
325 | } else { | |
326 | # Treat this chunk as format=fixed | |
327 | if ($nonfixed) { | |
328 | $chunk =~ s/(\r?\n)/<br>$1/g; | |
329 | if ($keepspace) { | |
330 | $chunk =~ s/^(.*)$/&preserve_space($1)/gem; | |
331 | } | |
332 | } else { | |
333 | $chunk = "<pre>" . $chunk . "</pre>\n"; | |
334 | } | |
335 | } | |
336 | my $newdepth = length($qd)/length('>'); | |
337 | if ($currdepth < $newdepth) { | |
338 | $chunk = $StartFlowedQuote x | |
339 | ($newdepth - $currdepth) . $chunk; | |
340 | } elsif ($currdepth > $newdepth) { | |
341 | $chunk = $EndFlowedQuote x | |
342 | ($currdepth - $newdepth) . $chunk; | |
343 | } | |
344 | $currdepth = $newdepth; | |
345 | $ret .= $chunk; | |
346 | ||
347 | } else { | |
348 | # The above regex should always match, but just in case... | |
349 | warn qq/\n/, | |
350 | qq/Warning: Dequoting problem with format=flowed data\n/, | |
351 | qq/ Message-Id: <$MHAmsgid>\n/, | |
352 | qq/ Message Number: $MHAmsgnum\n/; | |
353 | $ret .= $$data; | |
354 | last; | |
355 | } | |
356 | } | |
357 | if ($currdepth > 0) { | |
358 | $ret .= $EndFlowedQuote x $currdepth; | |
359 | } | |
360 | ||
361 | ## Post-processing cleanup: makes things look nicer | |
362 | $ret =~ s/<br><\/blockquote>/<\/blockquote>/g; | |
363 | $ret =~ s/<\/blockquote><br>/<\/blockquote>/g; | |
364 | ||
365 | $$data = $ret; | |
366 | ||
367 | } else { | |
368 | ## Check for quoting | |
369 | if ($doquote) { | |
370 | $$data =~ s@^( ?${HQuoteChars})(.*)$@$1<i>$2</i>@gom; | |
371 | } | |
372 | ||
373 | ## Check if using nonfixed font | |
374 | if ($nonfixed) { | |
375 | $$data =~ s/(\r?\n)/<br>$1/g; | |
376 | if ($keepspace) { | |
377 | $$data =~ s/^(.*)$/&preserve_space($1)/gem; | |
378 | } | |
379 | } else { | |
380 | $$data = "<pre>" . $$data . "</pre>\n"; | |
381 | } | |
382 | } | |
383 | ||
384 | ## Convert URLs to hyperlinks | |
385 | $$data =~ s@($HUrlExp)@<a $target href="$1">$1</a>@gio | |
386 | unless $nourl; | |
387 | ||
388 | ($$data); | |
389 | } | |
390 | ||
391 | ##---------------------------------------------------------------------------## | |
392 | ||
393 | sub esc_chars_inplace { | |
394 | my($foo) = shift; | |
395 | $$foo =~ s/&/&/g; | |
396 | $$foo =~ s/</</g; | |
397 | $$foo =~ s/>/>/g; | |
398 | $$foo =~ s/"/"/g; | |
399 | 1; | |
400 | } | |
401 | ||
402 | ##---------------------------------------------------------------------------## | |
403 | ||
404 | sub preserve_space { | |
405 | my($str) = shift; | |
406 | ||
407 | 1 while | |
408 | $str =~ s/^([^\t]*)(\t+)/$1 . ' ' x (length($2) * 8 - length($1) % 8)/e; | |
409 | $str =~ s/ /\ /g; | |
410 | $str; | |
411 | } | |
412 | ||
413 | ##---------------------------------------------------------------------------## | |
414 | ||
415 | sub break_line { | |
416 | my($str) = shift; | |
417 | my($width) = shift; | |
418 | my($q, $new) = ('', ''); | |
419 | my($try, $trywidth, $len); | |
420 | ||
421 | ## Translate tabs to spaces | |
422 | 1 while | |
423 | $str =~ s/^([^\t]*)(\t+)/$1 . ' ' x (length($2) * 8 - length($1) % 8)/e; | |
424 | ||
425 | ## Do nothing if str <= width | |
426 | return $str if length($str) <= $width; | |
427 | ||
428 | ## See if str begins with a quote char | |
429 | if ($str =~ s/^( ?$QuoteChars)//o) { | |
430 | $q = $1; | |
431 | --$width; | |
432 | } | |
433 | ||
434 | ## Create new string by breaking up str | |
435 | while ($str ne "") { | |
436 | ||
437 | # If $str less than width, break out | |
438 | if (length($str) <= $width) { | |
439 | $new .= $q . $str; | |
440 | last; | |
441 | } | |
442 | ||
443 | # handle case where no-whitespace line larger than width | |
444 | if (($str =~ /^(\S+)/) && (($len = length($1)) >= $width)) { | |
445 | $new .= $q . $1; | |
446 | substr($str, 0, $len) = ""; | |
447 | next; | |
448 | } | |
449 | ||
450 | # Break string at whitespace | |
451 | $try = ''; | |
452 | $trywidth = $width; | |
453 | $try = substr($str, 0, $trywidth); | |
454 | if ($try =~ /(\S+)$/) { | |
455 | $trywidth -= length($1); | |
456 | $new .= $q . substr($str, 0, $trywidth); | |
457 | } else { | |
458 | $new .= $q . $try; | |
459 | } | |
460 | substr($str, 0, $trywidth) = ''; | |
461 | ||
462 | } continue { | |
463 | $new .= "\n" if $str; | |
464 | } | |
465 | $new; | |
466 | } | |
467 | ||
468 | ##---------------------------------------------------------------------------## | |
469 | 1; |