| 1 | ##---------------------------------------------------------------------------## |
| 2 | ## File: |
| 3 | ## $Id: mhtxthtml.pl,v 2.22.2.1 2002/12/22 00:43:56 ehood Exp $ |
| 4 | ## Author: |
| 5 | ## Earl Hood mhonarc@mhonarc.org |
| 6 | ## Description: |
| 7 | ## Library defines routine to filter text/html body parts |
| 8 | ## for MHonArc. |
| 9 | ## Filter routine can be registered with the following: |
| 10 | ## <MIMEFILTERS> |
| 11 | ## text/html:m2h_text_html'filter:mhtxthtml.pl |
| 12 | ## </MIMEFILTERS> |
| 13 | ##---------------------------------------------------------------------------## |
| 14 | ## MHonArc -- Internet mail-to-HTML converter |
| 15 | ## Copyright (C) 1995-2000 Earl Hood, mhonarc@mhonarc.org |
| 16 | ## |
| 17 | ## This program is free software; you can redistribute it and/or modify |
| 18 | ## it under the terms of the GNU General Public License as published by |
| 19 | ## the Free Software Foundation; either version 2 of the License, or |
| 20 | ## (at your option) any later version. |
| 21 | ## |
| 22 | ## This program is distributed in the hope that it will be useful, |
| 23 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 24 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 25 | ## GNU General Public License for more details. |
| 26 | ## |
| 27 | ## You should have received a copy of the GNU General Public License |
| 28 | ## along with this program; if not, write to the Free Software |
| 29 | ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 30 | ##---------------------------------------------------------------------------## |
| 31 | |
| 32 | |
| 33 | package m2h_text_html; |
| 34 | |
| 35 | # Beginning of URL match expression |
| 36 | my $Url = '(\w+://|\w+:)'; |
| 37 | |
| 38 | # Script related attributes |
| 39 | my $SAttr = q/\b(?:onload|onunload|onclick|ondblclick|/. |
| 40 | q/onmouse(?:down|up|over|move|out)|/. |
| 41 | q/onkey(?:press|down|up)|style)\b/; |
| 42 | # Script/questionable related elements |
| 43 | my $SElem = q/\b(?:applet|base|embed|form|ilayer|input|layer|link|meta|/. |
| 44 | q/object|option|param|select|textarea)\b/; |
| 45 | |
| 46 | # Elements with auto-loaded URL attributes |
| 47 | my $AElem = q/\b(?:img|body|iframe|frame|object|script|input)\b/; |
| 48 | # URL attributes |
| 49 | my $UAttr = q/\b(?:action|background|cite|classid|codebase|data|datasrc|/. |
| 50 | q/dynsrc|for|href|longdesc|profile|src|url|usemap)\b/; |
| 51 | |
| 52 | ##--------------------------------------------------------------------------- |
| 53 | ## The filter must modify HTML content parts for merging into the |
| 54 | ## final filtered HTML messages. Modification is needed so the |
| 55 | ## resulting filtered message is valid HTML. |
| 56 | ## |
| 57 | ## Arguments: |
| 58 | ## |
| 59 | ## allowcomments Preserve any comment declarations. Normally |
| 60 | ## Comment declarations are munged to prevent |
| 61 | ## SSI attacks or comments that can conflict |
| 62 | ## with MHonArc processing. Use this option |
| 63 | ## with care. |
| 64 | ## |
| 65 | ## allownoncidurls Preserve URL-based attributes that are not |
| 66 | ## cid: URLs. Normally, any URL-based attribute |
| 67 | ## -- href, src, background, classid, data, |
| 68 | ## longdesc -- will be stripped if it is not a |
| 69 | ## cid: URL. This is to prevent malicious URLs |
| 70 | ## that verify mail addresses for spam purposes, |
| 71 | ## secretly set cookies, or gather some |
| 72 | ## statistical data automatically with the use of |
| 73 | ## elements that cause browsers to automatically |
| 74 | ## fetch data: IMG, BODY, IFRAME, FRAME, OBJECT, |
| 75 | ## SCRIPT, INPUT. |
| 76 | ## |
| 77 | ## allowscript Preserve any markup associated with scripting. |
| 78 | ## This includes elements and attributes related |
| 79 | ## to scripting. The default is to delete any |
| 80 | ## scripting markup for security reasons. |
| 81 | ## |
| 82 | ## attachcheck Honor attachment disposition. By default, |
| 83 | ## all text/html data is displayed inline on |
| 84 | ## the message page. If attachcheck is specified |
| 85 | ## and Content-Disposition specifies the data as |
| 86 | ## an attachment, the data is saved to a file |
| 87 | ## with a link to it from the message page. |
| 88 | ## |
| 89 | ## nofont Remove <FONT> tags. |
| 90 | ## |
| 91 | ## notitle Do not print title. |
| 92 | ## |
| 93 | sub filter { |
| 94 | my($fields, $data, $isdecode, $args) = @_; |
| 95 | $args = '' unless defined $args; |
| 96 | |
| 97 | ## Check if content-disposition should be checked |
| 98 | if ($args =~ /\battachcheck\b/i) { |
| 99 | my($disp, $nameparm) = readmail::MAILhead_get_disposition($fields); |
| 100 | if ($disp =~ /\battachment\b/i) { |
| 101 | require 'mhexternal.pl'; |
| 102 | return (m2h_external::filter( |
| 103 | $fields, $data, $isdecode, |
| 104 | readmail::get_filter_args('m2h_external::filter'))); |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | local(@files) = (); # XXX: Used by resolve_cid!!! |
| 109 | my $base = ''; |
| 110 | my $title = ''; |
| 111 | my $noscript = 1; |
| 112 | $noscript = 0 if $args =~ /\ballowscript\b/i; |
| 113 | my $nofont = $args =~ /\bnofont\b/i; |
| 114 | my $notitle = $args =~ /\bnotitle\b/i; |
| 115 | my $onlycid = $args !~ /\ballownoncidurls\b/i; |
| 116 | my $tmp; |
| 117 | |
| 118 | ## Check comment declarations: may screw-up mhonarc processing |
| 119 | ## and avoids someone sneaking in SSIs. |
| 120 | #$$data =~ s/<!(?:--(?:[^-]|-[^-])*--\s*)+>//go; # can crash perl |
| 121 | $$data =~ s/<!--[^-]+[#X%\$\[]*/<!--/g; # Just mung them (faster) |
| 122 | |
| 123 | ## Get/remove title |
| 124 | if (!$notitle) { |
| 125 | if ($$data =~ s|<title\s*>([^<]*)</title\s*>||io) { |
| 126 | $title = "<address>Title: <strong>$1</strong></address>\n" |
| 127 | unless $1 eq ""; |
| 128 | } |
| 129 | } else { |
| 130 | $$data =~ s|<title\s*>[^<]*</title\s*>||io; |
| 131 | } |
| 132 | |
| 133 | ## Get/remove BASE url |
| 134 | BASEURL: { |
| 135 | if ($$data =~ s|(<base\s[^>]*>)||i) { |
| 136 | $tmp = $1; |
| 137 | if ($tmp =~ m|href\s*=\s*['"]([^'"]+)['"]|i) { |
| 138 | $base = $1; |
| 139 | } elsif ($tmp =~ m|href\s*=\s*([^\s>]+)|i) { |
| 140 | $base = $1; |
| 141 | } |
| 142 | last BASEURL if ($base =~ /\S/); |
| 143 | } |
| 144 | if ((defined($tmp = $fields->{'content-base'}[0]) || |
| 145 | defined($tmp = $fields->{'content-location'}[0])) && |
| 146 | ($tmp =~ m%/%)) { |
| 147 | ($base = $tmp) =~ s/['"\s]//g; |
| 148 | } |
| 149 | } |
| 150 | $base =~ s|(.*/).*|$1|; |
| 151 | |
| 152 | ## Strip out certain elements/tags to support proper inclusion |
| 153 | $$data =~ s|<head\s*>[\s\S]*</head\s*>||io; |
| 154 | 1 while ($$data =~ s|<!doctype\s[^>]*>||io); |
| 155 | 1 while ($$data =~ s|</?html\b[^>]*>||gio); |
| 156 | 1 while ($$data =~ s|</?x-html\b[^>]*>||gio); |
| 157 | 1 while ($$data =~ s|</?meta\b[^>]*>||gio); |
| 158 | 1 while ($$data =~ s|</?link\b[^>]*>||gio); |
| 159 | |
| 160 | ## Strip out <font> tags if requested |
| 161 | if ($nofont) { |
| 162 | $$data =~ s|<style[^>]*>.*?</style\s*>||gios; |
| 163 | 1 while ($$data =~ s|</?font\b[^>]*>||gio); |
| 164 | 1 while ($$data =~ s/\b(?:style|class)\s*=\s*"[^"]*"//gio); |
| 165 | 1 while ($$data =~ s/\b(?:style|class)\s*=\s*'[^']*'//gio); |
| 166 | 1 while ($$data =~ s/\b(?:style|class)\s*=\s*[^\s>]+//gio); |
| 167 | 1 while ($$data =~ s|</?style\b[^>]*>||gi); |
| 168 | |
| 169 | } |
| 170 | |
| 171 | ## Strip out scripting markup if requested |
| 172 | if ($noscript) { |
| 173 | # remove scripting elements and attributes |
| 174 | $$data =~ s|<script[^>]*>.*?</script\s*>||gios; |
| 175 | unless ($nofont) { # avoid dup work if style already stripped |
| 176 | $$data =~ s|<style[^>]*>.*?</style\s*>||gios; |
| 177 | 1 while ($$data =~ s|</?style\b[^>]*>||gi); |
| 178 | } |
| 179 | 1 while ($$data =~ s|$SAttr\s*=\s*"[^"]*"||gio); #" |
| 180 | 1 while ($$data =~ s|$SAttr\s*=\s*'[^']*'||gio); #' |
| 181 | 1 while ($$data =~ s|$SAttr\s*=\s*[^\s>]+||gio); |
| 182 | 1 while ($$data =~ s|</?$SElem[^>]*>||gio); |
| 183 | 1 while ($$data =~ s|</?script\b||gi); |
| 184 | |
| 185 | # for netscape 4.x browsers |
| 186 | $$data =~ s/(=\s*["']?\s*)(?:\&\{)+/$1/g; |
| 187 | |
| 188 | # Hopefully complete pattern to neutralize javascript:... URLs. |
| 189 | # The pattern is ugly because we have to handle any combination |
| 190 | # of regular chars and entity refs. |
| 191 | $$data =~ s/\b(?:j|&\#(?:0*(?:74|106)|x0*(?:4a|6a))(?:;|(?![0-9]))) |
| 192 | (?:a|&\#(?:0*(?:65|97)|x0*(?:41|61))(?:;|(?![0-9]))) |
| 193 | (?:v|&\#(?:0*(?:86|118)|x0*(?:56|76))(?:;|(?![0-9]))) |
| 194 | (?:a|&\#(?:0*(?:65|97)|x0*(?:41|61))(?:;|(?![0-9]))) |
| 195 | (?:s|&\#(?:0*(?:83|115)|x0*(?:53|73))(?:;|(?![0-9]))) |
| 196 | (?:c|&\#(?:0*(?:67|99)|x0*(?:43|63))(?:;|(?![0-9]))) |
| 197 | (?:r|&\#(?:0*(?:82|114)|x0*(?:52|72))(?:;|(?![0-9]))) |
| 198 | (?:i|&\#(?:0*(?:73|105)|x0*(?:49|69))(?:;|(?![0-9]))) |
| 199 | (?:p|&\#(?:0*(?:80|112)|x0*(?:50|70))(?:;|(?![0-9]))) |
| 200 | (?:t|&\#(?:0*(?:84|116)|x0*(?:54|74))(?:;|(?![0-9]))) |
| 201 | /_javascript_/gix; |
| 202 | |
| 203 | } |
| 204 | |
| 205 | ## Modify relative urls to absolute using BASE |
| 206 | if ($base =~ /\S/) { |
| 207 | $$data =~ s/($UAttr\s*=\s*['"])([^'"]+)(['"])/ |
| 208 | join("", $1, &addbase($base,$2), $3)/geoix; |
| 209 | } |
| 210 | |
| 211 | ## Check for frames: Do not support, so just show source |
| 212 | if ($$data =~ m/<frameset\b/i) { |
| 213 | $$data = join('', '<pre>', mhonarc::htmlize($$data), '</pre>'); |
| 214 | return ($title.$$data, @files); |
| 215 | } |
| 216 | |
| 217 | ## Check for body attributes |
| 218 | if ($$data =~ s|<body\b([^>]*)>||i) { |
| 219 | require 'mhutil.pl'; |
| 220 | my $a = $1; |
| 221 | my %attr = mhonarc::parse_vardef_str($a, 1); |
| 222 | if (%attr) { |
| 223 | ## Use a table with a single cell to encapsulate data to |
| 224 | ## set visual properties. We use a mixture of old attributes |
| 225 | ## and CSS to set properties since browsers may not support |
| 226 | ## all of the CSS settings via the STYLE attribute. |
| 227 | my $tpre = '<table width="100%"><tr><td '; |
| 228 | my $tsuf = ""; |
| 229 | $tpre .= qq|background="$attr{'background'}" | |
| 230 | if $attr{'background'}; |
| 231 | $tpre .= qq|bgcolor="$attr{'bgcolor'}" | |
| 232 | if $attr{'bgcolor'}; |
| 233 | $tpre .= qq|style="|; |
| 234 | $tpre .= qq|background-color: $attr{'bgcolor'}; | |
| 235 | if $attr{'bgcolor'}; |
| 236 | if ($attr{'background'}) { |
| 237 | if ($attr{'background'} = |
| 238 | &resolve_cid($onlycid, $attr{'background'})) { |
| 239 | $tpre .= qq|background-image: url($attr{'background'}) |; |
| 240 | } |
| 241 | } |
| 242 | $tpre .= qq|color: $attr{'text'}; | |
| 243 | if $attr{'text'}; |
| 244 | $tpre .= qq|a:link { color: $attr{'link'} } | |
| 245 | if $attr{'link'}; |
| 246 | $tpre .= qq|a:active { color: $attr{'alink'} } | |
| 247 | if $attr{'alink'}; |
| 248 | $tpre .= qq|a:visited { color: $attr{'vlink'} } | |
| 249 | if $attr{'vlink'}; |
| 250 | $tpre .= '">'; |
| 251 | if ($attr{'text'}) { |
| 252 | $tpre .= qq|<font color="$attr{'text'}">|; |
| 253 | $tsuf .= '</font>'; |
| 254 | } |
| 255 | $tsuf .= '</td></tr></table>'; |
| 256 | $$data = $tpre . $$data . $tsuf; |
| 257 | } |
| 258 | } |
| 259 | 1 while ($$data =~ s|</?body[^>]*>||ig); |
| 260 | |
| 261 | ## Check for CID URLs (multipart/related HTML) |
| 262 | $$data =~ s/($UAttr\s*=\s*['"])([^'"]+)(['"])/ |
| 263 | join("", $1, &resolve_cid($onlycid, $2), $3)/geoix; |
| 264 | $$data =~ s/($UAttr\s*=\s*)([^'">][^\s>]+)/ |
| 265 | join("", $1, '"', &resolve_cid($onlycid, $2), '"')/geoix; |
| 266 | |
| 267 | ($title.$$data, @files); |
| 268 | } |
| 269 | |
| 270 | ##--------------------------------------------------------------------------- |
| 271 | |
| 272 | sub addbase { |
| 273 | my($b, $u) = @_; |
| 274 | return $u if !defined($b) || $b !~ /\S/; |
| 275 | |
| 276 | my($ret); |
| 277 | $u =~ s/^\s+//; |
| 278 | if ($u =~ m%^$Url%o || $u =~ m/^#/) { |
| 279 | ## Absolute URL or scroll link; do nothing |
| 280 | $ret = $u; |
| 281 | } else { |
| 282 | ## Relative URL |
| 283 | if ($u =~ /^\./) { |
| 284 | ## "./---" or "../---": Need to remove and adjust base |
| 285 | ## accordingly. |
| 286 | $b =~ s/\/$//; |
| 287 | my @a = split(/\//, $b); |
| 288 | my $cnt = 0; |
| 289 | while ( $cnt <= scalar(@a) && |
| 290 | $u =~ s|^(\.{1,2})/|| ) { ++$cnt if length($1) == 2; } |
| 291 | splice(@a, -$cnt) if $cnt > 0; |
| 292 | $b = join('/', @a, ""); |
| 293 | |
| 294 | } elsif ($u =~ m%^/%) { |
| 295 | ## "/---": Just use hostname:port of base. |
| 296 | $b =~ s%^(${Url}[^/]*)/.*%$1%o; |
| 297 | } |
| 298 | $ret = $b . $u; |
| 299 | } |
| 300 | $ret; |
| 301 | } |
| 302 | |
| 303 | ##--------------------------------------------------------------------------- |
| 304 | |
| 305 | sub resolve_cid { |
| 306 | my $onlycid = shift; |
| 307 | my $cid = shift; |
| 308 | my $href = $readmail::Cid{$cid}; |
| 309 | if (!defined($href)) { |
| 310 | my $basename = $cid; |
| 311 | $basename =~ s/.*\///; |
| 312 | if (!defined($href = $readmail::Cid{$basename})) { |
| 313 | return "" if $onlycid; |
| 314 | return ($cid =~ /^cid:/i)? "": $cid; |
| 315 | } |
| 316 | $cid = $basename; |
| 317 | } |
| 318 | |
| 319 | if ($href->{'uri'}) { |
| 320 | # Part already converted; multiple references to part |
| 321 | return $href->{'uri'}; |
| 322 | } |
| 323 | |
| 324 | require 'mhmimetypes.pl'; |
| 325 | my $filename; |
| 326 | my $decodefunc = |
| 327 | readmail::load_decoder( |
| 328 | $href->{'fields'}{'content-transfer-encoding'}[0]); |
| 329 | if (defined($decodefunc) && defined(&$decodefunc)) { |
| 330 | my $data = &$decodefunc(${$href->{'body'}}); |
| 331 | $filename = mhonarc::write_attachment( |
| 332 | $href->{'fields'}{'content-type'}[0], \$data); |
| 333 | } else { |
| 334 | $filename = mhonarc::write_attachment( |
| 335 | $href->{'fields'}{'content-type'}[0], |
| 336 | $href->{'body'}); |
| 337 | } |
| 338 | $href->{'filtered'} = 1; # mark part filtered for readmail.pl |
| 339 | $href->{'uri'} = $filename; |
| 340 | |
| 341 | push(@files, $filename); # @files defined in filter!! |
| 342 | $filename; |
| 343 | } |
| 344 | |
| 345 | ##--------------------------------------------------------------------------- |
| 346 | |
| 347 | 1; |