| 1 | # rfc822.pl -- A perl package to manipulate RFC822 mail headers |
| 2 | # A. P. Barrett <barrett@ee.und.ac.za>, June 1993 |
| 3 | # $Revision: 1.1 $$Date: 1996/09/17 13:37:40 $ |
| 4 | |
| 5 | # Synopsis: |
| 6 | # require 'rfc822.pl'; |
| 7 | # |
| 8 | # # sample input |
| 9 | # $string = 'Joe (Random) User <@route:"j.r.l"@host.com>'; |
| 10 | # |
| 11 | # @toks = &rfc822'tokenise($string); |
| 12 | # # Convert string to tokens. |
| 13 | # # In an array context, returns: |
| 14 | # # ('Joe', '(Random)', 'User', '<', '@', 'route', ':', |
| 15 | # # '"j.r.l"', '@', 'host', '.', 'com', '>') |
| 16 | # # Not intended for use in a scalar context, but would return: |
| 17 | # # 'Joe(Random)User<@route:"j.r.l"@host.com>' |
| 18 | # |
| 19 | # $newstring = &rfc822'untokenise(@toks); |
| 20 | # # Convert tokens to string with minimum white space. |
| 21 | # # Not intended for use in an array context. |
| 22 | # # In a scalar context, returns: |
| 23 | # # 'Joe(Random)User<@route:"j.r.l"@host.com>' |
| 24 | # |
| 25 | # @newtoks = &rfc822'uncomment($string); |
| 26 | # @newtoks = &rfc822'uncomment(@toks); |
| 27 | # $newstring = &rfc822'uncomment($string); |
| 28 | # $newstring = &rfc822'uncomment(@toks); |
| 29 | # # Remove comments. |
| 30 | # # In an array context, returns: |
| 31 | # # ('Joe', 'User', '<', '@', 'route', ':', |
| 32 | # # '"j.r.l"', '@', 'host', '.', 'com', '>') |
| 33 | # # In a scalar context, returns: |
| 34 | # # 'Joe User<@route:"j.r.l"@host.com>' |
| 35 | # |
| 36 | # @newtoks = &rfc822'first_route_addr($string); |
| 37 | # @newtoks = &rfc822'first_route_addr(@toks); |
| 38 | # $newstring = &rfc822'first_route_addr($string); |
| 39 | # $newstring = &rfc822'first_route_addr(@toks); |
| 40 | # # Obtain first route-addr or addr-spec. |
| 41 | # # In an array context, returns: |
| 42 | # # ('<', '@', 'route', ':', |
| 43 | # # '"j.r.l"', '@', 'host', '.', 'com', '>') |
| 44 | # # In a scalar context, returns: |
| 45 | # # '<@route:"j.r.l"@host.com>' |
| 46 | # |
| 47 | # @newtoks = &rfc822'first_addr_spec($string); |
| 48 | # @newtoks = &rfc822'first_addr_spec(@toks); |
| 49 | # $newstring = &rfc822'first_addr_spec($string); |
| 50 | # $newstring = &rfc822'first_addr_spec(@toks); |
| 51 | # # Obtain first addr-spec. |
| 52 | # # In an array context, returns: |
| 53 | # # ('"j.r.l"', '@', 'host', '.', 'com') |
| 54 | # # In a scalar context, returns: |
| 55 | # # '"j.r.l"@host.com' |
| 56 | |
| 57 | package rfc822; |
| 58 | |
| 59 | # Define some variables to help us write regexps. |
| 60 | $self_delimiters = '<>@,;:.'; # use /[$self_delimiters]/ |
| 61 | $specials = $self_delimiters.'()\\\\"\\[\\]'; # use /[$specials]/ |
| 62 | $quoted_pair = '\\\\.'; # use /$quoted_pair/ |
| 63 | $qp_or_bs_end = $quoted_pair.'|\\\\$'; # use /$qp_or_bs_end/ |
| 64 | |
| 65 | # Tokenise, per RFC 822. |
| 66 | # |
| 67 | # As an extension, allows atoms to contain quoted pairs. |
| 68 | # The last output token might contain an unterminated quoted pair, |
| 69 | # comment, domain literal or quoted string. |
| 70 | # Other output tokens might contain solitary unmatched special characters. |
| 71 | # |
| 72 | # Input is a single string. |
| 73 | # In an array context, output is a list of tokens. |
| 74 | # In a scalar context, output is a single string (not very useful). |
| 75 | sub tokenise |
| 76 | { |
| 77 | local ($_) = @_; |
| 78 | local (@outtoks); |
| 79 | local ($firstchar); |
| 80 | local ($comment, $comment_depth); |
| 81 | |
| 82 | while (s/^\s*(\S)/$firstchar = $1/e) { |
| 83 | if ($firstchar =~ /[$self_delimiters]/o) { |
| 84 | # a special character as a self-delimiting token. |
| 85 | s/^(.)//; |
| 86 | push (@outtoks, $1); |
| 87 | } elsif ($firstchar eq '"') { |
| 88 | # a quoted string. |
| 89 | # XXX we don't prohibit bare CR. |
| 90 | s/^(\"($qp_or_bs_end|[^\\"])*\")//o; |
| 91 | push (@outtoks, $1); |
| 92 | } elsif ($firstchar eq '[') { |
| 93 | # a domain literal. |
| 94 | # XXX we don't prohibit bare CR or '['. |
| 95 | s/^(\[($qp_or_bs_end|[^\\\]])*(\]|$))//o; |
| 96 | push (@outtoks, $1); |
| 97 | } elsif ($firstchar eq '(') { |
| 98 | # a comment. |
| 99 | do { |
| 100 | s/^([^()]*([()]|$))//; |
| 101 | $comment .= $1; |
| 102 | $comment_depth++ if $2 eq '('; |
| 103 | $comment_depth-- if $2 eq ')'; |
| 104 | do { |
| 105 | # XXX error recovery for unterminated comment |
| 106 | $comment_depth = 0; |
| 107 | } if $2 eq ''; |
| 108 | } until ($comment_depth == 0); |
| 109 | push (@outtoks, $comment); |
| 110 | } elsif ($firstchar ne '\\' && $firstchar =~ /[$specials]/o) { |
| 111 | # an illegal special character. |
| 112 | s/^(.)//; |
| 113 | push (@outtoks, $1); |
| 114 | } else { |
| 115 | # should be an atom, which is not allowed to contain |
| 116 | # special characters or control characters. |
| 117 | # we have already checked for all special chars except |
| 118 | # controls and backslash. |
| 119 | # XXX we don't check for controls. |
| 120 | # XXX we allow a quoted-pair as part of an atom. |
| 121 | s/^(($qp_or_bs_end|[^\s$specials])+)//o; |
| 122 | push (@outtoks, $1); |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | # return result |
| 127 | wantarray ? @outtoks : &untokenise(@outtoks); |
| 128 | } |
| 129 | |
| 130 | # Convert a list of tokens to a single string. |
| 131 | # |
| 132 | # Just pastes the tokens together, with blanks where they are essential. |
| 133 | # |
| 134 | # Input is a list of tokens. |
| 135 | # Output is a single string. |
| 136 | sub untokenise |
| 137 | { |
| 138 | local ($token, $prevtok); |
| 139 | local ($result); |
| 140 | local ($prev, $this); |
| 141 | |
| 142 | foreach $token (@_) { |
| 143 | # Do we need a space? |
| 144 | # A space is essential when both the left and right tokens |
| 145 | # are either atoms or quoted strings. |
| 146 | # XXX - Spaces are desirable in some other places, but for |
| 147 | # now it's too difficult to worry about that. It's |
| 148 | # context-dependent anyway -- for example, we sometimes |
| 149 | # want spaces after ':' and ',', but not when they appear |
| 150 | # inside a route-addr. The tokener has no business knowing |
| 151 | # about such details. |
| 152 | if ($result ne '') { |
| 153 | $prev = substr($prevtok, $[, 1); |
| 154 | $this = substr($token, $[, 1); |
| 155 | if ( ($this eq '"' || $this !~ /[$specials]/o) |
| 156 | && ($prev eq '"' || $prev !~ /[$specials]/o)) |
| 157 | { |
| 158 | $result .= ' '; |
| 159 | } |
| 160 | } |
| 161 | $result .= $token; |
| 162 | $prevtok = $token; |
| 163 | } |
| 164 | |
| 165 | # return result |
| 166 | $result; |
| 167 | } |
| 168 | |
| 169 | # Delete comments. |
| 170 | # |
| 171 | # Input can be a single string or a list of tokens. |
| 172 | # In an array context, output is a list of tokens. |
| 173 | # In a scalar context, output is a single string. |
| 174 | sub uncomment |
| 175 | { |
| 176 | local (@intoks) = @_; |
| 177 | local (@outtoks); |
| 178 | local ($token); |
| 179 | |
| 180 | # tokenise the input if we were given a single string |
| 181 | @intoks = &tokenise($intoks[$[]) if $#intoks le $[; |
| 182 | |
| 183 | # delete comment tokens |
| 184 | @outtoks = grep (/^[^(]/, @intoks); |
| 185 | |
| 186 | # return result |
| 187 | wantarray ? @outtoks : &untokenise(@outtoks); |
| 188 | } |
| 189 | |
| 190 | # Try to extract a single RFC-822 route-addr or addr-spec from a |
| 191 | # list of addresses. |
| 192 | # |
| 193 | # Returns the first route-addr or addr-spec if there are several |
| 194 | # (for example, if the input is a comma-separated list).. |
| 195 | # Garbage in, garbage out. |
| 196 | # |
| 197 | # Input can be a single string or a list of tokens. |
| 198 | # In an array context, output is a list of tokens. |
| 199 | # In a scalar context, output is a single string. |
| 200 | sub first_route_addr |
| 201 | { |
| 202 | local (@intoks) = @_; |
| 203 | local (@outtoks); |
| 204 | local ($token, $firstchar); |
| 205 | local ($state) = 'start'; |
| 206 | |
| 207 | # tokenise the input if we were given a single string |
| 208 | @intoks = &tokenise($intoks[$[]) if $#intoks le $[; |
| 209 | |
| 210 | foreach $token (@intoks) { |
| 211 | $firstchar = substr($token,0,1); |
| 212 | if ($firstchar eq '(') { |
| 213 | # ignore comments |
| 214 | next; |
| 215 | } elsif ($firstchar eq '<') { |
| 216 | # '<' is start of route-addr. |
| 217 | # discard what came before. |
| 218 | $state = 'routeaddr'; |
| 219 | @outtoks = ($token); |
| 220 | } elsif ($firstchar eq ':') { |
| 221 | # ':' might be end of phrase for a group, |
| 222 | # or might be end of route and start of addr-spec in route-addr. |
| 223 | if ($state eq 'routeaddr') { |
| 224 | push (@outtoks, $token); |
| 225 | } else { |
| 226 | $state = 'start'; |
| 227 | @outtoks = (); |
| 228 | } |
| 229 | } elsif ($firstchar eq ',') { |
| 230 | # ',' might be a separator between addresses |
| 231 | # or might be part of a route inside a route-addr. |
| 232 | if ($state eq 'routeaddr') { |
| 233 | push (@outtoks, $token); |
| 234 | } else { |
| 235 | $state = 'start'; |
| 236 | last if $#outtoks ge $[; # we got what we wanted |
| 237 | } |
| 238 | } elsif ($firstchar eq '>') { |
| 239 | # '>' is end of route-addr |
| 240 | push (@outtoks, $token); |
| 241 | $state = 'end'; |
| 242 | last; # we got what we wanted |
| 243 | } elsif ($firstchar eq ';') { |
| 244 | # ';' is end of group |
| 245 | $state = 'end'; |
| 246 | last if $#outtoks ge $[; # we got what we wanted |
| 247 | } else { |
| 248 | # accumulate valid tokens. |
| 249 | push (@outtoks, $token); |
| 250 | } |
| 251 | } |
| 252 | |
| 253 | # return result |
| 254 | wantarray ? @outtoks : &untokenise(@outtoks); |
| 255 | } |
| 256 | |
| 257 | # Try to extract a single RFC-822 addr-spec from a list of addresses. |
| 258 | # |
| 259 | # Returns the first addr-spec if there are several. |
| 260 | # Garbage in, garbage out. |
| 261 | # |
| 262 | # Input can be a single string or a list of tokens. |
| 263 | # In an array context, output is a list of tokens. |
| 264 | # In a scalar context, output is a single string. |
| 265 | sub first_addr_spec |
| 266 | { |
| 267 | local (@intoks) = @_; |
| 268 | local ($token); |
| 269 | local ($i, $startpos, $endpos); |
| 270 | |
| 271 | # Get the first route-addr or addr-spec |
| 272 | @intoks = &first_route_addr(@intoks); |
| 273 | |
| 274 | # if starts with '<' then it was a route-addr. |
| 275 | # Keep the stuff between the last ':' (if any) and the first '>'. |
| 276 | if ($intoks[$[] eq '<') { |
| 277 | $startpos = $[+1; # skip the initial '<' |
| 278 | $endpos = $#intoks; # don't yet know if there is a final '>' |
| 279 | foreach $i ($startpos..$endpos) { |
| 280 | $token = $intoks[$i]; |
| 281 | if ($token eq '>') { |
| 282 | $endpos = $i - 1; |
| 283 | last; |
| 284 | } elsif ($token eq ':') { |
| 285 | $startpos = $i + 1; |
| 286 | } |
| 287 | } |
| 288 | } |
| 289 | # if it didn't start with '<' then it was an addr-spec |
| 290 | else { |
| 291 | $startpos = $[; |
| 292 | $endpos = $#intoks; |
| 293 | } |
| 294 | |
| 295 | # return result |
| 296 | wantarray ? @intoks[$startpos..$endpos] |
| 297 | : &untokenise(@intoks[$startpos..$endpos]); |
| 298 | } |
| 299 | |
| 300 | # Lame attempt at some standalone test code. |
| 301 | # I don't know a good way to tell if we were called from 'require' |
| 302 | # or as a standalone program, so we guess by examining $0. |
| 303 | if ($0 =~ /(^|\/)rfc822\.pl$/) { |
| 304 | |
| 305 | package main; |
| 306 | while (<>) { |
| 307 | $string = $_; |
| 308 | print "input:\t$string"; |
| 309 | @toks = &rfc822'tokenise($string); |
| 310 | print "tokenise:\n\t", join("\n\t", @toks), "\n"; |
| 311 | print "untokenise: ", &rfc822'untokenise(@toks), "\n"; |
| 312 | foreach $op ('uncomment', 'first_route_addr', 'first_addr_spec') { |
| 313 | ## just test the scalar to scalar version |
| 314 | eval qq[ |
| 315 | \$newstring = &rfc822'$op(\$string); |
| 316 | print "$op:\t", \$newstring, "\n"; |
| 317 | ]; |
| 318 | ## test all four permutations |
| 319 | ## of scalar and array inputs and outputs |
| 320 | # eval qq[ |
| 321 | # print "$op:\n"; |
| 322 | # \@newtoks = &rfc822'$op(\$string); |
| 323 | # print " s-->a:\n\t", join("\n\t", \@newtoks), "\n"; |
| 324 | # \$newstring = &rfc822'$op(\$string); |
| 325 | # print " s-->s:\t", \$newstring, "\n"; |
| 326 | # \@newtoks = &rfc822'$op(\@toks); |
| 327 | # print " a-->a:\n\t", join("\n\t", \@newtoks), "\n"; |
| 328 | # \$newstring = &rfc822'$op(\@toks); |
| 329 | # print " a-->s:\t", \$newstring, "\n"; |
| 330 | # ]; |
| 331 | } |
| 332 | } |
| 333 | exit 0; |
| 334 | |
| 335 | } |
| 336 | |
| 337 | 1; # for require |