# rfc822.pl -- A perl package to manipulate RFC822 mail headers
# A. P. Barrett <barrett@ee.und.ac.za>, June 1993
# $Revision: 1.1 $$Date: 1996/09/17 13:37:40 $
# $string = 'Joe (Random) User <@route:"j.r.l"@host.com>';
# @toks = &rfc822'tokenise($string);
# # Convert string to tokens.
# # In an array context, returns:
# # ('Joe', '(Random)', 'User', '<', '@', 'route', ':',
# # '"j.r.l"', '@', 'host', '.', 'com', '>')
# # Not intended for use in a scalar context, but would return:
# # 'Joe(Random)User<@route:"j.r.l"@host.com>'
# $newstring = &rfc822'untokenise(@toks);
# # Convert tokens to string with minimum white space.
# # Not intended for use in an array context.
# # In a scalar context, returns:
# # 'Joe(Random)User<@route:"j.r.l"@host.com>'
# @newtoks = &rfc822'uncomment($string);
# @newtoks = &rfc822'uncomment(@toks);
# $newstring = &rfc822'uncomment($string);
# $newstring = &rfc822'uncomment(@toks);
# # In an array context, returns:
# # ('Joe', 'User', '<', '@', 'route', ':',
# # '"j.r.l"', '@', 'host', '.', 'com', '>')
# # In a scalar context, returns:
# # 'Joe User<@route:"j.r.l"@host.com>'
# @newtoks = &rfc822'first_route_addr($string);
# @newtoks = &rfc822'first_route_addr(@toks);
# $newstring = &rfc822'first_route_addr($string);
# $newstring = &rfc822'first_route_addr(@toks);
# # Obtain first route-addr or addr-spec.
# # In an array context, returns:
# # ('<', '@', 'route', ':',
# # '"j.r.l"', '@', 'host', '.', 'com', '>')
# # In a scalar context, returns:
# # '<@route:"j.r.l"@host.com>'
# @newtoks = &rfc822'first_addr_spec($string);
# @newtoks = &rfc822'first_addr_spec(@toks);
# $newstring = &rfc822'first_addr_spec($string);
# $newstring = &rfc822'first_addr_spec(@toks);
# # Obtain first addr-spec.
# # In an array context, returns:
# # ('"j.r.l"', '@', 'host', '.', 'com')
# # In a scalar context, returns:
# Define some variables to help us write regexps.
$self_delimiters = '<>@,;:.'; # use /[$self_delimiters]/
$specials = $self_delimiters.'()\\\\"\\[\\]'; # use /[$specials]/
$quoted_pair = '\\\\.'; # use /$quoted_pair/
$qp_or_bs_end = $quoted_pair.'|\\\\$'; # use /$qp_or_bs_end/
# As an extension, allows atoms to contain quoted pairs.
# The last output token might contain an unterminated quoted pair,
# comment, domain literal or quoted string.
# Other output tokens might contain solitary unmatched special characters.
# Input is a single string.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string (not very useful).
local ($comment, $comment_depth);
while (s/^\s*(\S)/$firstchar = $1/e) {
if ($firstchar =~ /[$self_delimiters]/o) {
# a special character as a self-delimiting token.
} elsif ($firstchar eq '"') {
# XXX we don't prohibit bare CR.
s/^(\"($qp_or_bs_end|[^\\"])*\")//o;
} elsif ($firstchar eq '[') {
# XXX we don't prohibit bare CR or '['.
s/^(\[($qp_or_bs_end|[^\\\]])*(\]|$))//o;
} elsif ($firstchar eq '(') {
$comment_depth++ if $2 eq '(';
$comment_depth-- if $2 eq ')';
# XXX error recovery for unterminated comment
} until ($comment_depth == 0);
push (@outtoks, $comment);
} elsif ($firstchar ne '\\' && $firstchar =~ /[$specials]/o) {
# an illegal special character.
# should be an atom, which is not allowed to contain
# special characters or control characters.
# we have already checked for all special chars except
# controls and backslash.
# XXX we don't check for controls.
# XXX we allow a quoted-pair as part of an atom.
s/^(($qp_or_bs_end|[^\s$specials])+)//o;
wantarray ?
@outtoks : &untokenise
(@outtoks);
# Convert a list of tokens to a single string.
# Just pastes the tokens together, with blanks where they are essential.
# Input is a list of tokens.
# Output is a single string.
local ($token, $prevtok);
# A space is essential when both the left and right tokens
# are either atoms or quoted strings.
# XXX - Spaces are desirable in some other places, but for
# now it's too difficult to worry about that. It's
# context-dependent anyway -- for example, we sometimes
# want spaces after ':' and ',', but not when they appear
# inside a route-addr. The tokener has no business knowing
$prev = substr($prevtok, $[, 1);
$this = substr($token, $[, 1);
if ( ($this eq '"' || $this !~ /[$specials]/o)
&& ($prev eq '"' || $prev !~ /[$specials]/o))
# Input can be a single string or a list of tokens.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string.
# tokenise the input if we were given a single string
@intoks = &tokenise
($intoks[$[]) if $#intoks le $[;
@outtoks = grep (/^[^(]/, @intoks);
wantarray ?
@outtoks : &untokenise
(@outtoks);
# Try to extract a single RFC-822 route-addr or addr-spec from a
# Returns the first route-addr or addr-spec if there are several
# (for example, if the input is a comma-separated list)..
# Garbage in, garbage out.
# Input can be a single string or a list of tokens.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string.
local ($token, $firstchar);
local ($state) = 'start';
# tokenise the input if we were given a single string
@intoks = &tokenise
($intoks[$[]) if $#intoks le $[;
foreach $token (@intoks) {
$firstchar = substr($token,0,1);
} elsif ($firstchar eq '<') {
# '<' is start of route-addr.
# discard what came before.
} elsif ($firstchar eq ':') {
# ':' might be end of phrase for a group,
# or might be end of route and start of addr-spec in route-addr.
if ($state eq 'routeaddr') {
} elsif ($firstchar eq ',') {
# ',' might be a separator between addresses
# or might be part of a route inside a route-addr.
if ($state eq 'routeaddr') {
last if $#outtoks ge $[; # we got what we wanted
} elsif ($firstchar eq '>') {
# '>' is end of route-addr
last; # we got what we wanted
} elsif ($firstchar eq ';') {
last if $#outtoks ge $[; # we got what we wanted
# accumulate valid tokens.
wantarray ?
@outtoks : &untokenise
(@outtoks);
# Try to extract a single RFC-822 addr-spec from a list of addresses.
# Returns the first addr-spec if there are several.
# Garbage in, garbage out.
# Input can be a single string or a list of tokens.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string.
local ($i, $startpos, $endpos);
# Get the first route-addr or addr-spec
@intoks = &first_route_addr
(@intoks);
# if starts with '<' then it was a route-addr.
# Keep the stuff between the last ':' (if any) and the first '>'.
if ($intoks[$[] eq '<') {
$startpos = $[+1; # skip the initial '<'
$endpos = $#intoks; # don't yet know if there is a final '>'
foreach $i ($startpos..$endpos) {
} elsif ($token eq ':') {
# if it didn't start with '<' then it was an addr-spec
wantarray ?
@intoks[$startpos..$endpos]
: &untokenise
(@intoks[$startpos..$endpos]);
# Lame attempt at some standalone test code.
# I don't know a good way to tell if we were called from 'require'
# or as a standalone program, so we guess by examining $0.
if ($0 =~ /(^|\/)rfc822\
.pl
$/) {
@toks = &rfc822
'tokenise($string);
print "tokenise:\n\t", join("\n\t", @toks), "\n";
print "untokenise: ", &rfc822'untokenise
(@toks), "\n";
foreach $op ('uncomment', 'first_route_addr', 'first_addr_spec') {
## just test the scalar to scalar version
\
$newstring = &rfc822
'$op(\$string);
print "$op:\t", \$newstring, "\n";
## test all four permutations
## of scalar and array inputs and outputs
# \@newtoks = &rfc822'$op(\
$string);
# print " s-->a:\n\t", join("\n\t", \@newtoks), "\n";
# \$newstring = &rfc822'$op(\$string);
# print " s-->s:\t", \$newstring, "\n";
# \@newtoks = &rfc822'$op(\@toks);
# print " a-->a:\n\t", join("\n\t", \@newtoks), "\n";
# \$newstring = &rfc822'$op(\@toks);
# print " a-->s:\t", \$newstring, "\n";