Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | ##---------------------------------------------------------------------------## |
2 | ## File: | |
3 | ## $Id: mhtxthtml.pl,v 2.22.2.1 2002/12/22 00:43:56 ehood Exp $ | |
4 | ## Author: | |
5 | ## Earl Hood mhonarc@mhonarc.org | |
6 | ## Description: | |
7 | ## Library defines routine to filter text/html body parts | |
8 | ## for MHonArc. | |
9 | ## Filter routine can be registered with the following: | |
10 | ## <MIMEFILTERS> | |
11 | ## text/html:m2h_text_html'filter:mhtxthtml.pl | |
12 | ## </MIMEFILTERS> | |
13 | ##---------------------------------------------------------------------------## | |
14 | ## MHonArc -- Internet mail-to-HTML converter | |
15 | ## Copyright (C) 1995-2000 Earl Hood, mhonarc@mhonarc.org | |
16 | ## | |
17 | ## This program is free software; you can redistribute it and/or modify | |
18 | ## it under the terms of the GNU General Public License as published by | |
19 | ## the Free Software Foundation; either version 2 of the License, or | |
20 | ## (at your option) any later version. | |
21 | ## | |
22 | ## This program is distributed in the hope that it will be useful, | |
23 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
24 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
25 | ## GNU General Public License for more details. | |
26 | ## | |
27 | ## You should have received a copy of the GNU General Public License | |
28 | ## along with this program; if not, write to the Free Software | |
29 | ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
30 | ##---------------------------------------------------------------------------## | |
31 | ||
32 | ||
33 | package m2h_text_html; | |
34 | ||
35 | # Beginning of URL match expression | |
36 | my $Url = '(\w+://|\w+:)'; | |
37 | ||
38 | # Script related attributes | |
39 | my $SAttr = q/\b(?:onload|onunload|onclick|ondblclick|/. | |
40 | q/onmouse(?:down|up|over|move|out)|/. | |
41 | q/onkey(?:press|down|up)|style)\b/; | |
42 | # Script/questionable related elements | |
43 | my $SElem = q/\b(?:applet|base|embed|form|ilayer|input|layer|link|meta|/. | |
44 | q/object|option|param|select|textarea)\b/; | |
45 | ||
46 | # Elements with auto-loaded URL attributes | |
47 | my $AElem = q/\b(?:img|body|iframe|frame|object|script|input)\b/; | |
48 | # URL attributes | |
49 | my $UAttr = q/\b(?:action|background|cite|classid|codebase|data|datasrc|/. | |
50 | q/dynsrc|for|href|longdesc|profile|src|url|usemap)\b/; | |
51 | ||
52 | ##--------------------------------------------------------------------------- | |
53 | ## The filter must modify HTML content parts for merging into the | |
54 | ## final filtered HTML messages. Modification is needed so the | |
55 | ## resulting filtered message is valid HTML. | |
56 | ## | |
57 | ## Arguments: | |
58 | ## | |
59 | ## allowcomments Preserve any comment declarations. Normally | |
60 | ## Comment declarations are munged to prevent | |
61 | ## SSI attacks or comments that can conflict | |
62 | ## with MHonArc processing. Use this option | |
63 | ## with care. | |
64 | ## | |
65 | ## allownoncidurls Preserve URL-based attributes that are not | |
66 | ## cid: URLs. Normally, any URL-based attribute | |
67 | ## -- href, src, background, classid, data, | |
68 | ## longdesc -- will be stripped if it is not a | |
69 | ## cid: URL. This is to prevent malicious URLs | |
70 | ## that verify mail addresses for spam purposes, | |
71 | ## secretly set cookies, or gather some | |
72 | ## statistical data automatically with the use of | |
73 | ## elements that cause browsers to automatically | |
74 | ## fetch data: IMG, BODY, IFRAME, FRAME, OBJECT, | |
75 | ## SCRIPT, INPUT. | |
76 | ## | |
77 | ## allowscript Preserve any markup associated with scripting. | |
78 | ## This includes elements and attributes related | |
79 | ## to scripting. The default is to delete any | |
80 | ## scripting markup for security reasons. | |
81 | ## | |
82 | ## attachcheck Honor attachment disposition. By default, | |
83 | ## all text/html data is displayed inline on | |
84 | ## the message page. If attachcheck is specified | |
85 | ## and Content-Disposition specifies the data as | |
86 | ## an attachment, the data is saved to a file | |
87 | ## with a link to it from the message page. | |
88 | ## | |
89 | ## nofont Remove <FONT> tags. | |
90 | ## | |
91 | ## notitle Do not print title. | |
92 | ## | |
93 | sub filter { | |
94 | my($fields, $data, $isdecode, $args) = @_; | |
95 | $args = '' unless defined $args; | |
96 | ||
97 | ## Check if content-disposition should be checked | |
98 | if ($args =~ /\battachcheck\b/i) { | |
99 | my($disp, $nameparm) = readmail::MAILhead_get_disposition($fields); | |
100 | if ($disp =~ /\battachment\b/i) { | |
101 | require 'mhexternal.pl'; | |
102 | return (m2h_external::filter( | |
103 | $fields, $data, $isdecode, | |
104 | readmail::get_filter_args('m2h_external::filter'))); | |
105 | } | |
106 | } | |
107 | ||
108 | local(@files) = (); # XXX: Used by resolve_cid!!! | |
109 | my $base = ''; | |
110 | my $title = ''; | |
111 | my $noscript = 1; | |
112 | $noscript = 0 if $args =~ /\ballowscript\b/i; | |
113 | my $nofont = $args =~ /\bnofont\b/i; | |
114 | my $notitle = $args =~ /\bnotitle\b/i; | |
115 | my $onlycid = $args !~ /\ballownoncidurls\b/i; | |
116 | my $tmp; | |
117 | ||
118 | ## Check comment declarations: may screw-up mhonarc processing | |
119 | ## and avoids someone sneaking in SSIs. | |
120 | #$$data =~ s/<!(?:--(?:[^-]|-[^-])*--\s*)+>//go; # can crash perl | |
121 | $$data =~ s/<!--[^-]+[#X%\$\[]*/<!--/g; # Just mung them (faster) | |
122 | ||
123 | ## Get/remove title | |
124 | if (!$notitle) { | |
125 | if ($$data =~ s|<title\s*>([^<]*)</title\s*>||io) { | |
126 | $title = "<address>Title: <strong>$1</strong></address>\n" | |
127 | unless $1 eq ""; | |
128 | } | |
129 | } else { | |
130 | $$data =~ s|<title\s*>[^<]*</title\s*>||io; | |
131 | } | |
132 | ||
133 | ## Get/remove BASE url | |
134 | BASEURL: { | |
135 | if ($$data =~ s|(<base\s[^>]*>)||i) { | |
136 | $tmp = $1; | |
137 | if ($tmp =~ m|href\s*=\s*['"]([^'"]+)['"]|i) { | |
138 | $base = $1; | |
139 | } elsif ($tmp =~ m|href\s*=\s*([^\s>]+)|i) { | |
140 | $base = $1; | |
141 | } | |
142 | last BASEURL if ($base =~ /\S/); | |
143 | } | |
144 | if ((defined($tmp = $fields->{'content-base'}[0]) || | |
145 | defined($tmp = $fields->{'content-location'}[0])) && | |
146 | ($tmp =~ m%/%)) { | |
147 | ($base = $tmp) =~ s/['"\s]//g; | |
148 | } | |
149 | } | |
150 | $base =~ s|(.*/).*|$1|; | |
151 | ||
152 | ## Strip out certain elements/tags to support proper inclusion | |
153 | $$data =~ s|<head\s*>[\s\S]*</head\s*>||io; | |
154 | 1 while ($$data =~ s|<!doctype\s[^>]*>||io); | |
155 | 1 while ($$data =~ s|</?html\b[^>]*>||gio); | |
156 | 1 while ($$data =~ s|</?x-html\b[^>]*>||gio); | |
157 | 1 while ($$data =~ s|</?meta\b[^>]*>||gio); | |
158 | 1 while ($$data =~ s|</?link\b[^>]*>||gio); | |
159 | ||
160 | ## Strip out <font> tags if requested | |
161 | if ($nofont) { | |
162 | $$data =~ s|<style[^>]*>.*?</style\s*>||gios; | |
163 | 1 while ($$data =~ s|</?font\b[^>]*>||gio); | |
164 | 1 while ($$data =~ s/\b(?:style|class)\s*=\s*"[^"]*"//gio); | |
165 | 1 while ($$data =~ s/\b(?:style|class)\s*=\s*'[^']*'//gio); | |
166 | 1 while ($$data =~ s/\b(?:style|class)\s*=\s*[^\s>]+//gio); | |
167 | 1 while ($$data =~ s|</?style\b[^>]*>||gi); | |
168 | ||
169 | } | |
170 | ||
171 | ## Strip out scripting markup if requested | |
172 | if ($noscript) { | |
173 | # remove scripting elements and attributes | |
174 | $$data =~ s|<script[^>]*>.*?</script\s*>||gios; | |
175 | unless ($nofont) { # avoid dup work if style already stripped | |
176 | $$data =~ s|<style[^>]*>.*?</style\s*>||gios; | |
177 | 1 while ($$data =~ s|</?style\b[^>]*>||gi); | |
178 | } | |
179 | 1 while ($$data =~ s|$SAttr\s*=\s*"[^"]*"||gio); #" | |
180 | 1 while ($$data =~ s|$SAttr\s*=\s*'[^']*'||gio); #' | |
181 | 1 while ($$data =~ s|$SAttr\s*=\s*[^\s>]+||gio); | |
182 | 1 while ($$data =~ s|</?$SElem[^>]*>||gio); | |
183 | 1 while ($$data =~ s|</?script\b||gi); | |
184 | ||
185 | # for netscape 4.x browsers | |
186 | $$data =~ s/(=\s*["']?\s*)(?:\&\{)+/$1/g; | |
187 | ||
188 | # Hopefully complete pattern to neutralize javascript:... URLs. | |
189 | # The pattern is ugly because we have to handle any combination | |
190 | # of regular chars and entity refs. | |
191 | $$data =~ s/\b(?:j|&\#(?:0*(?:74|106)|x0*(?:4a|6a))(?:;|(?![0-9]))) | |
192 | (?:a|&\#(?:0*(?:65|97)|x0*(?:41|61))(?:;|(?![0-9]))) | |
193 | (?:v|&\#(?:0*(?:86|118)|x0*(?:56|76))(?:;|(?![0-9]))) | |
194 | (?:a|&\#(?:0*(?:65|97)|x0*(?:41|61))(?:;|(?![0-9]))) | |
195 | (?:s|&\#(?:0*(?:83|115)|x0*(?:53|73))(?:;|(?![0-9]))) | |
196 | (?:c|&\#(?:0*(?:67|99)|x0*(?:43|63))(?:;|(?![0-9]))) | |
197 | (?:r|&\#(?:0*(?:82|114)|x0*(?:52|72))(?:;|(?![0-9]))) | |
198 | (?:i|&\#(?:0*(?:73|105)|x0*(?:49|69))(?:;|(?![0-9]))) | |
199 | (?:p|&\#(?:0*(?:80|112)|x0*(?:50|70))(?:;|(?![0-9]))) | |
200 | (?:t|&\#(?:0*(?:84|116)|x0*(?:54|74))(?:;|(?![0-9]))) | |
201 | /_javascript_/gix; | |
202 | ||
203 | } | |
204 | ||
205 | ## Modify relative urls to absolute using BASE | |
206 | if ($base =~ /\S/) { | |
207 | $$data =~ s/($UAttr\s*=\s*['"])([^'"]+)(['"])/ | |
208 | join("", $1, &addbase($base,$2), $3)/geoix; | |
209 | } | |
210 | ||
211 | ## Check for frames: Do not support, so just show source | |
212 | if ($$data =~ m/<frameset\b/i) { | |
213 | $$data = join('', '<pre>', mhonarc::htmlize($$data), '</pre>'); | |
214 | return ($title.$$data, @files); | |
215 | } | |
216 | ||
217 | ## Check for body attributes | |
218 | if ($$data =~ s|<body\b([^>]*)>||i) { | |
219 | require 'mhutil.pl'; | |
220 | my $a = $1; | |
221 | my %attr = mhonarc::parse_vardef_str($a, 1); | |
222 | if (%attr) { | |
223 | ## Use a table with a single cell to encapsulate data to | |
224 | ## set visual properties. We use a mixture of old attributes | |
225 | ## and CSS to set properties since browsers may not support | |
226 | ## all of the CSS settings via the STYLE attribute. | |
227 | my $tpre = '<table width="100%"><tr><td '; | |
228 | my $tsuf = ""; | |
229 | $tpre .= qq|background="$attr{'background'}" | | |
230 | if $attr{'background'}; | |
231 | $tpre .= qq|bgcolor="$attr{'bgcolor'}" | | |
232 | if $attr{'bgcolor'}; | |
233 | $tpre .= qq|style="|; | |
234 | $tpre .= qq|background-color: $attr{'bgcolor'}; | | |
235 | if $attr{'bgcolor'}; | |
236 | if ($attr{'background'}) { | |
237 | if ($attr{'background'} = | |
238 | &resolve_cid($onlycid, $attr{'background'})) { | |
239 | $tpre .= qq|background-image: url($attr{'background'}) |; | |
240 | } | |
241 | } | |
242 | $tpre .= qq|color: $attr{'text'}; | | |
243 | if $attr{'text'}; | |
244 | $tpre .= qq|a:link { color: $attr{'link'} } | | |
245 | if $attr{'link'}; | |
246 | $tpre .= qq|a:active { color: $attr{'alink'} } | | |
247 | if $attr{'alink'}; | |
248 | $tpre .= qq|a:visited { color: $attr{'vlink'} } | | |
249 | if $attr{'vlink'}; | |
250 | $tpre .= '">'; | |
251 | if ($attr{'text'}) { | |
252 | $tpre .= qq|<font color="$attr{'text'}">|; | |
253 | $tsuf .= '</font>'; | |
254 | } | |
255 | $tsuf .= '</td></tr></table>'; | |
256 | $$data = $tpre . $$data . $tsuf; | |
257 | } | |
258 | } | |
259 | 1 while ($$data =~ s|</?body[^>]*>||ig); | |
260 | ||
261 | ## Check for CID URLs (multipart/related HTML) | |
262 | $$data =~ s/($UAttr\s*=\s*['"])([^'"]+)(['"])/ | |
263 | join("", $1, &resolve_cid($onlycid, $2), $3)/geoix; | |
264 | $$data =~ s/($UAttr\s*=\s*)([^'">][^\s>]+)/ | |
265 | join("", $1, '"', &resolve_cid($onlycid, $2), '"')/geoix; | |
266 | ||
267 | ($title.$$data, @files); | |
268 | } | |
269 | ||
270 | ##--------------------------------------------------------------------------- | |
271 | ||
272 | sub addbase { | |
273 | my($b, $u) = @_; | |
274 | return $u if !defined($b) || $b !~ /\S/; | |
275 | ||
276 | my($ret); | |
277 | $u =~ s/^\s+//; | |
278 | if ($u =~ m%^$Url%o || $u =~ m/^#/) { | |
279 | ## Absolute URL or scroll link; do nothing | |
280 | $ret = $u; | |
281 | } else { | |
282 | ## Relative URL | |
283 | if ($u =~ /^\./) { | |
284 | ## "./---" or "../---": Need to remove and adjust base | |
285 | ## accordingly. | |
286 | $b =~ s/\/$//; | |
287 | my @a = split(/\//, $b); | |
288 | my $cnt = 0; | |
289 | while ( $cnt <= scalar(@a) && | |
290 | $u =~ s|^(\.{1,2})/|| ) { ++$cnt if length($1) == 2; } | |
291 | splice(@a, -$cnt) if $cnt > 0; | |
292 | $b = join('/', @a, ""); | |
293 | ||
294 | } elsif ($u =~ m%^/%) { | |
295 | ## "/---": Just use hostname:port of base. | |
296 | $b =~ s%^(${Url}[^/]*)/.*%$1%o; | |
297 | } | |
298 | $ret = $b . $u; | |
299 | } | |
300 | $ret; | |
301 | } | |
302 | ||
303 | ##--------------------------------------------------------------------------- | |
304 | ||
305 | sub resolve_cid { | |
306 | my $onlycid = shift; | |
307 | my $cid = shift; | |
308 | my $href = $readmail::Cid{$cid}; | |
309 | if (!defined($href)) { | |
310 | my $basename = $cid; | |
311 | $basename =~ s/.*\///; | |
312 | if (!defined($href = $readmail::Cid{$basename})) { | |
313 | return "" if $onlycid; | |
314 | return ($cid =~ /^cid:/i)? "": $cid; | |
315 | } | |
316 | $cid = $basename; | |
317 | } | |
318 | ||
319 | if ($href->{'uri'}) { | |
320 | # Part already converted; multiple references to part | |
321 | return $href->{'uri'}; | |
322 | } | |
323 | ||
324 | require 'mhmimetypes.pl'; | |
325 | my $filename; | |
326 | my $decodefunc = | |
327 | readmail::load_decoder( | |
328 | $href->{'fields'}{'content-transfer-encoding'}[0]); | |
329 | if (defined($decodefunc) && defined(&$decodefunc)) { | |
330 | my $data = &$decodefunc(${$href->{'body'}}); | |
331 | $filename = mhonarc::write_attachment( | |
332 | $href->{'fields'}{'content-type'}[0], \$data); | |
333 | } else { | |
334 | $filename = mhonarc::write_attachment( | |
335 | $href->{'fields'}{'content-type'}[0], | |
336 | $href->{'body'}); | |
337 | } | |
338 | $href->{'filtered'} = 1; # mark part filtered for readmail.pl | |
339 | $href->{'uri'} = $filename; | |
340 | ||
341 | push(@files, $filename); # @files defined in filter!! | |
342 | $filename; | |
343 | } | |
344 | ||
345 | ##--------------------------------------------------------------------------- | |
346 | ||
347 | 1; |