Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | # rfc822.pl -- A perl package to manipulate RFC822 mail headers |
2 | # A. P. Barrett <barrett@ee.und.ac.za>, June 1993 | |
3 | # $Revision: 1.1 $$Date: 1996/09/17 13:37:40 $ | |
4 | ||
5 | # Synopsis: | |
6 | # require 'rfc822.pl'; | |
7 | # | |
8 | # # sample input | |
9 | # $string = 'Joe (Random) User <@route:"j.r.l"@host.com>'; | |
10 | # | |
11 | # @toks = &rfc822'tokenise($string); | |
12 | # # Convert string to tokens. | |
13 | # # In an array context, returns: | |
14 | # # ('Joe', '(Random)', 'User', '<', '@', 'route', ':', | |
15 | # # '"j.r.l"', '@', 'host', '.', 'com', '>') | |
16 | # # Not intended for use in a scalar context, but would return: | |
17 | # # 'Joe(Random)User<@route:"j.r.l"@host.com>' | |
18 | # | |
19 | # $newstring = &rfc822'untokenise(@toks); | |
20 | # # Convert tokens to string with minimum white space. | |
21 | # # Not intended for use in an array context. | |
22 | # # In a scalar context, returns: | |
23 | # # 'Joe(Random)User<@route:"j.r.l"@host.com>' | |
24 | # | |
25 | # @newtoks = &rfc822'uncomment($string); | |
26 | # @newtoks = &rfc822'uncomment(@toks); | |
27 | # $newstring = &rfc822'uncomment($string); | |
28 | # $newstring = &rfc822'uncomment(@toks); | |
29 | # # Remove comments. | |
30 | # # In an array context, returns: | |
31 | # # ('Joe', 'User', '<', '@', 'route', ':', | |
32 | # # '"j.r.l"', '@', 'host', '.', 'com', '>') | |
33 | # # In a scalar context, returns: | |
34 | # # 'Joe User<@route:"j.r.l"@host.com>' | |
35 | # | |
36 | # @newtoks = &rfc822'first_route_addr($string); | |
37 | # @newtoks = &rfc822'first_route_addr(@toks); | |
38 | # $newstring = &rfc822'first_route_addr($string); | |
39 | # $newstring = &rfc822'first_route_addr(@toks); | |
40 | # # Obtain first route-addr or addr-spec. | |
41 | # # In an array context, returns: | |
42 | # # ('<', '@', 'route', ':', | |
43 | # # '"j.r.l"', '@', 'host', '.', 'com', '>') | |
44 | # # In a scalar context, returns: | |
45 | # # '<@route:"j.r.l"@host.com>' | |
46 | # | |
47 | # @newtoks = &rfc822'first_addr_spec($string); | |
48 | # @newtoks = &rfc822'first_addr_spec(@toks); | |
49 | # $newstring = &rfc822'first_addr_spec($string); | |
50 | # $newstring = &rfc822'first_addr_spec(@toks); | |
51 | # # Obtain first addr-spec. | |
52 | # # In an array context, returns: | |
53 | # # ('"j.r.l"', '@', 'host', '.', 'com') | |
54 | # # In a scalar context, returns: | |
55 | # # '"j.r.l"@host.com' | |
56 | ||
57 | package rfc822; | |
58 | ||
59 | # Define some variables to help us write regexps. | |
60 | $self_delimiters = '<>@,;:.'; # use /[$self_delimiters]/ | |
61 | $specials = $self_delimiters.'()\\\\"\\[\\]'; # use /[$specials]/ | |
62 | $quoted_pair = '\\\\.'; # use /$quoted_pair/ | |
63 | $qp_or_bs_end = $quoted_pair.'|\\\\$'; # use /$qp_or_bs_end/ | |
64 | ||
65 | # Tokenise, per RFC 822. | |
66 | # | |
67 | # As an extension, allows atoms to contain quoted pairs. | |
68 | # The last output token might contain an unterminated quoted pair, | |
69 | # comment, domain literal or quoted string. | |
70 | # Other output tokens might contain solitary unmatched special characters. | |
71 | # | |
72 | # Input is a single string. | |
73 | # In an array context, output is a list of tokens. | |
74 | # In a scalar context, output is a single string (not very useful). | |
75 | sub tokenise | |
76 | { | |
77 | local ($_) = @_; | |
78 | local (@outtoks); | |
79 | local ($firstchar); | |
80 | local ($comment, $comment_depth); | |
81 | ||
82 | while (s/^\s*(\S)/$firstchar = $1/e) { | |
83 | if ($firstchar =~ /[$self_delimiters]/o) { | |
84 | # a special character as a self-delimiting token. | |
85 | s/^(.)//; | |
86 | push (@outtoks, $1); | |
87 | } elsif ($firstchar eq '"') { | |
88 | # a quoted string. | |
89 | # XXX we don't prohibit bare CR. | |
90 | s/^(\"($qp_or_bs_end|[^\\"])*\")//o; | |
91 | push (@outtoks, $1); | |
92 | } elsif ($firstchar eq '[') { | |
93 | # a domain literal. | |
94 | # XXX we don't prohibit bare CR or '['. | |
95 | s/^(\[($qp_or_bs_end|[^\\\]])*(\]|$))//o; | |
96 | push (@outtoks, $1); | |
97 | } elsif ($firstchar eq '(') { | |
98 | # a comment. | |
99 | do { | |
100 | s/^([^()]*([()]|$))//; | |
101 | $comment .= $1; | |
102 | $comment_depth++ if $2 eq '('; | |
103 | $comment_depth-- if $2 eq ')'; | |
104 | do { | |
105 | # XXX error recovery for unterminated comment | |
106 | $comment_depth = 0; | |
107 | } if $2 eq ''; | |
108 | } until ($comment_depth == 0); | |
109 | push (@outtoks, $comment); | |
110 | } elsif ($firstchar ne '\\' && $firstchar =~ /[$specials]/o) { | |
111 | # an illegal special character. | |
112 | s/^(.)//; | |
113 | push (@outtoks, $1); | |
114 | } else { | |
115 | # should be an atom, which is not allowed to contain | |
116 | # special characters or control characters. | |
117 | # we have already checked for all special chars except | |
118 | # controls and backslash. | |
119 | # XXX we don't check for controls. | |
120 | # XXX we allow a quoted-pair as part of an atom. | |
121 | s/^(($qp_or_bs_end|[^\s$specials])+)//o; | |
122 | push (@outtoks, $1); | |
123 | } | |
124 | } | |
125 | ||
126 | # return result | |
127 | wantarray ? @outtoks : &untokenise(@outtoks); | |
128 | } | |
129 | ||
130 | # Convert a list of tokens to a single string. | |
131 | # | |
132 | # Just pastes the tokens together, with blanks where they are essential. | |
133 | # | |
134 | # Input is a list of tokens. | |
135 | # Output is a single string. | |
136 | sub untokenise | |
137 | { | |
138 | local ($token, $prevtok); | |
139 | local ($result); | |
140 | local ($prev, $this); | |
141 | ||
142 | foreach $token (@_) { | |
143 | # Do we need a space? | |
144 | # A space is essential when both the left and right tokens | |
145 | # are either atoms or quoted strings. | |
146 | # XXX - Spaces are desirable in some other places, but for | |
147 | # now it's too difficult to worry about that. It's | |
148 | # context-dependent anyway -- for example, we sometimes | |
149 | # want spaces after ':' and ',', but not when they appear | |
150 | # inside a route-addr. The tokener has no business knowing | |
151 | # about such details. | |
152 | if ($result ne '') { | |
153 | $prev = substr($prevtok, $[, 1); | |
154 | $this = substr($token, $[, 1); | |
155 | if ( ($this eq '"' || $this !~ /[$specials]/o) | |
156 | && ($prev eq '"' || $prev !~ /[$specials]/o)) | |
157 | { | |
158 | $result .= ' '; | |
159 | } | |
160 | } | |
161 | $result .= $token; | |
162 | $prevtok = $token; | |
163 | } | |
164 | ||
165 | # return result | |
166 | $result; | |
167 | } | |
168 | ||
169 | # Delete comments. | |
170 | # | |
171 | # Input can be a single string or a list of tokens. | |
172 | # In an array context, output is a list of tokens. | |
173 | # In a scalar context, output is a single string. | |
174 | sub uncomment | |
175 | { | |
176 | local (@intoks) = @_; | |
177 | local (@outtoks); | |
178 | local ($token); | |
179 | ||
180 | # tokenise the input if we were given a single string | |
181 | @intoks = &tokenise($intoks[$[]) if $#intoks le $[; | |
182 | ||
183 | # delete comment tokens | |
184 | @outtoks = grep (/^[^(]/, @intoks); | |
185 | ||
186 | # return result | |
187 | wantarray ? @outtoks : &untokenise(@outtoks); | |
188 | } | |
189 | ||
190 | # Try to extract a single RFC-822 route-addr or addr-spec from a | |
191 | # list of addresses. | |
192 | # | |
193 | # Returns the first route-addr or addr-spec if there are several | |
194 | # (for example, if the input is a comma-separated list).. | |
195 | # Garbage in, garbage out. | |
196 | # | |
197 | # Input can be a single string or a list of tokens. | |
198 | # In an array context, output is a list of tokens. | |
199 | # In a scalar context, output is a single string. | |
200 | sub first_route_addr | |
201 | { | |
202 | local (@intoks) = @_; | |
203 | local (@outtoks); | |
204 | local ($token, $firstchar); | |
205 | local ($state) = 'start'; | |
206 | ||
207 | # tokenise the input if we were given a single string | |
208 | @intoks = &tokenise($intoks[$[]) if $#intoks le $[; | |
209 | ||
210 | foreach $token (@intoks) { | |
211 | $firstchar = substr($token,0,1); | |
212 | if ($firstchar eq '(') { | |
213 | # ignore comments | |
214 | next; | |
215 | } elsif ($firstchar eq '<') { | |
216 | # '<' is start of route-addr. | |
217 | # discard what came before. | |
218 | $state = 'routeaddr'; | |
219 | @outtoks = ($token); | |
220 | } elsif ($firstchar eq ':') { | |
221 | # ':' might be end of phrase for a group, | |
222 | # or might be end of route and start of addr-spec in route-addr. | |
223 | if ($state eq 'routeaddr') { | |
224 | push (@outtoks, $token); | |
225 | } else { | |
226 | $state = 'start'; | |
227 | @outtoks = (); | |
228 | } | |
229 | } elsif ($firstchar eq ',') { | |
230 | # ',' might be a separator between addresses | |
231 | # or might be part of a route inside a route-addr. | |
232 | if ($state eq 'routeaddr') { | |
233 | push (@outtoks, $token); | |
234 | } else { | |
235 | $state = 'start'; | |
236 | last if $#outtoks ge $[; # we got what we wanted | |
237 | } | |
238 | } elsif ($firstchar eq '>') { | |
239 | # '>' is end of route-addr | |
240 | push (@outtoks, $token); | |
241 | $state = 'end'; | |
242 | last; # we got what we wanted | |
243 | } elsif ($firstchar eq ';') { | |
244 | # ';' is end of group | |
245 | $state = 'end'; | |
246 | last if $#outtoks ge $[; # we got what we wanted | |
247 | } else { | |
248 | # accumulate valid tokens. | |
249 | push (@outtoks, $token); | |
250 | } | |
251 | } | |
252 | ||
253 | # return result | |
254 | wantarray ? @outtoks : &untokenise(@outtoks); | |
255 | } | |
256 | ||
257 | # Try to extract a single RFC-822 addr-spec from a list of addresses. | |
258 | # | |
259 | # Returns the first addr-spec if there are several. | |
260 | # Garbage in, garbage out. | |
261 | # | |
262 | # Input can be a single string or a list of tokens. | |
263 | # In an array context, output is a list of tokens. | |
264 | # In a scalar context, output is a single string. | |
265 | sub first_addr_spec | |
266 | { | |
267 | local (@intoks) = @_; | |
268 | local ($token); | |
269 | local ($i, $startpos, $endpos); | |
270 | ||
271 | # Get the first route-addr or addr-spec | |
272 | @intoks = &first_route_addr(@intoks); | |
273 | ||
274 | # if starts with '<' then it was a route-addr. | |
275 | # Keep the stuff between the last ':' (if any) and the first '>'. | |
276 | if ($intoks[$[] eq '<') { | |
277 | $startpos = $[+1; # skip the initial '<' | |
278 | $endpos = $#intoks; # don't yet know if there is a final '>' | |
279 | foreach $i ($startpos..$endpos) { | |
280 | $token = $intoks[$i]; | |
281 | if ($token eq '>') { | |
282 | $endpos = $i - 1; | |
283 | last; | |
284 | } elsif ($token eq ':') { | |
285 | $startpos = $i + 1; | |
286 | } | |
287 | } | |
288 | } | |
289 | # if it didn't start with '<' then it was an addr-spec | |
290 | else { | |
291 | $startpos = $[; | |
292 | $endpos = $#intoks; | |
293 | } | |
294 | ||
295 | # return result | |
296 | wantarray ? @intoks[$startpos..$endpos] | |
297 | : &untokenise(@intoks[$startpos..$endpos]); | |
298 | } | |
299 | ||
300 | # Lame attempt at some standalone test code. | |
301 | # I don't know a good way to tell if we were called from 'require' | |
302 | # or as a standalone program, so we guess by examining $0. | |
303 | if ($0 =~ /(^|\/)rfc822\.pl$/) { | |
304 | ||
305 | package main; | |
306 | while (<>) { | |
307 | $string = $_; | |
308 | print "input:\t$string"; | |
309 | @toks = &rfc822'tokenise($string); | |
310 | print "tokenise:\n\t", join("\n\t", @toks), "\n"; | |
311 | print "untokenise: ", &rfc822'untokenise(@toks), "\n"; | |
312 | foreach $op ('uncomment', 'first_route_addr', 'first_addr_spec') { | |
313 | ## just test the scalar to scalar version | |
314 | eval qq[ | |
315 | \$newstring = &rfc822'$op(\$string); | |
316 | print "$op:\t", \$newstring, "\n"; | |
317 | ]; | |
318 | ## test all four permutations | |
319 | ## of scalar and array inputs and outputs | |
320 | # eval qq[ | |
321 | # print "$op:\n"; | |
322 | # \@newtoks = &rfc822'$op(\$string); | |
323 | # print " s-->a:\n\t", join("\n\t", \@newtoks), "\n"; | |
324 | # \$newstring = &rfc822'$op(\$string); | |
325 | # print " s-->s:\t", \$newstring, "\n"; | |
326 | # \@newtoks = &rfc822'$op(\@toks); | |
327 | # print " a-->a:\n\t", join("\n\t", \@newtoks), "\n"; | |
328 | # \$newstring = &rfc822'$op(\@toks); | |
329 | # print " a-->s:\t", \$newstring, "\n"; | |
330 | # ]; | |
331 | } | |
332 | } | |
333 | exit 0; | |
334 | ||
335 | } | |
336 | ||
337 | 1; # for require |