From 5dc32024862cccac54f76f6f988b4435b0d28986 Mon Sep 17 00:00:00 2001 From: CSRG Date: Wed, 30 Sep 1987 01:52:04 -0800 Subject: [PATCH] BSD 4_3_Tahoe development Work on file usr/src/lib/libc/gen/regexp/regexp.3 Work on file usr/src/lib/libc/gen/regexp/regexp.h Synthesized-from: CSRG/cd2/4.3tahoe --- usr/src/lib/libc/gen/regexp/regexp.3 | 179 +++++++++++++++++++++++++++ usr/src/lib/libc/gen/regexp/regexp.h | 21 ++++ 2 files changed, 200 insertions(+) create mode 100644 usr/src/lib/libc/gen/regexp/regexp.3 create mode 100644 usr/src/lib/libc/gen/regexp/regexp.h diff --git a/usr/src/lib/libc/gen/regexp/regexp.3 b/usr/src/lib/libc/gen/regexp/regexp.3 new file mode 100644 index 0000000000..c5b6b09d10 --- /dev/null +++ b/usr/src/lib/libc/gen/regexp/regexp.3 @@ -0,0 +1,179 @@ +.TH REGEXP 3 local +.DA 30 Nov 1985 +.SH NAME +regcomp, regexec, regsub, regerror \- regular expression handler +.SH SYNOPSIS +.ft B +.nf +#include <regexp.h> + +regexp *regcomp(exp) +char *exp; + +int regexec(prog, string) +regexp *prog; +char *string; + +regsub(prog, source, dest) +regexp *prog; +char *source; +char *dest; + +regerror(msg) +char *msg; +.SH DESCRIPTION +These functions implement +.IR egrep (1)-style +regular expressions and supporting facilities. +.PP +.I Regcomp +compiles a regular expression into a structure of type +.IR regexp , +and returns a pointer to it. +The space has been allocated using +.IR malloc (3) +and may be released by +.IR free . +.PP +.I Regexec +matches a NUL-terminated \fIstring\fR against the compiled regular expression +in \fIprog\fR. +It returns 1 for success and 0 for failure, and adjusts the contents of +\fIprog\fR's \fIstartp\fR and \fIendp\fR (see below) accordingly. +.PP +The members of a +.I regexp +structure include at least the following (not necessarily in order): +.PP +.RS +char *startp[NSUBEXP]; +.br +char *endp[NSUBEXP]; +.RE +.PP +where +.I NSUBEXP +is defined (as 10) in the header file. 
+Once a successful \fIregexec\fR has been done using the \fIregexp\fR, +each \fIstartp\fR-\fIendp\fR pair describes one substring +within the \fIstring\fR, +with the \fIstartp\fR pointing to the first character of the substring and +the \fIendp\fR pointing to the first character following the substring. +The 0th substring is the substring of \fIstring\fR that matched the whole +regular expression. +The others are those substrings that matched parenthesized expressions +within the regular expression, with parenthesized expressions numbered +in left-to-right order of their opening parentheses. +.PP +.I Regsub +copies \fIsource\fR to \fIdest\fR, making substitutions according to the +most recent \fIregexec\fR performed using \fIprog\fR. +Each instance of `&' in \fIsource\fR is replaced by the substring +indicated by \fIstartp\fR[\fI0\fR] and +\fIendp\fR[\fI0\fR]. +Each instance of `\e\fIn\fR', where \fIn\fR is a digit, is replaced by +the substring indicated by +\fIstartp\fR[\fIn\fR] and +\fIendp\fR[\fIn\fR]. +To get a literal `&' or `\e\fIn\fR' into \fIdest\fR, prefix it with `\e'; +to get a literal `\e' preceding `&' or `\e\fIn\fR', prefix it with +another `\e'. +.PP +.I Regerror +is called whenever an error is detected in \fIregcomp\fR, \fIregexec\fR, +or \fIregsub\fR. +The default \fIregerror\fR writes the string \fImsg\fR, +with a suitable indicator of origin, +on the standard +error output +and invokes \fIexit\fR(2). +.I Regerror +can be replaced by the user if other actions are desirable. +.SH "REGULAR EXPRESSION SYNTAX" +A regular expression is zero or more \fIbranches\fR, separated by `|'. +It matches anything that matches one of the branches. +.PP +A branch is zero or more \fIpieces\fR, concatenated. +It matches a match for the first, followed by a match for the second, etc. +.PP +A piece is an \fIatom\fR possibly followed by `*', `+', or `?'. +An atom followed by `*' matches a sequence of 0 or more matches of the atom. 
+An atom followed by `+' matches a sequence of 1 or more matches of the atom. +An atom followed by `?' matches a match of the atom, or the null string. +.PP +An atom is a regular expression in parentheses (matching a match for the +regular expression), a \fIrange\fR (see below), `.' +(matching any single character), `^' (matching the null string at the +beginning of the input string), `$' (matching the null string at the +end of the input string), a `\e' followed by a single character (matching +that character), or a single character with no other significance +(matching that character). +.PP +A \fIrange\fR is a sequence of characters enclosed in `[]'. +It normally matches any single character from the sequence. +If the sequence begins with `^', +it matches any single character \fInot\fR from the rest of the sequence. +If two characters in the sequence are separated by `\-', this is shorthand +for the full list of ASCII characters between them +(e.g. `[0-9]' matches any decimal digit). +To include a literal `]' in the sequence, make it the first character +(following a possible `^'). +To include a literal `\-', make it the first or last character. +.SH AMBIGUITY +If a regular expression could match two different parts of the input string, +it will match the one which begins earliest. +If both begin in the same place but match different lengths, or match +the same length in different ways, life gets messier, as follows. +.PP +In general, the possibilities in a list of branches are considered in +left-to-right order, the possibilities for `*', `+', and `?' are +considered longest-first, nested constructs are considered from the +outermost in, and concatenated constructs are considered leftmost-first. +The match that will be chosen is the one that uses the earliest +possibility in the first choice that has to be made. +If there is more than one choice, the next will be made in the same manner +(earliest possibility) subject to the decision on the first choice. 
+And so forth. +.PP +For example, `(ab|a)b*c' could match `abc' in one of two ways. +The first choice is between `ab' and `a'; since `ab' is earlier, and does +lead to a successful overall match, it is chosen. +Since the `b' is already spoken for, +the `b*' must match its last possibility\(emthe empty string\(emsince +it must respect the earlier choice. +.PP +In the particular case where no `|'s are present and there is only one +`*', `+', or `?', the net effect is that the longest possible +match will be chosen. +So `ab*', presented with `xabbbby', will match `abbbb'. +Note that if `ab*' is tried against `xabyabbbz', it +will match `ab' just after `x', due to the begins-earliest rule. +(In effect, the decision on where to start the match is the first choice +to be made, hence subsequent choices must respect it even if this leads them +to less-preferred alternatives.) +.SH SEE ALSO +egrep(1), expr(1) +.SH DIAGNOSTICS +\fIRegcomp\fR returns NULL for a failure +(\fIregerror\fR permitting), +where failures are syntax errors, exceeding implementation limits, +or applying `+' or `*' to a possibly-null operand. +.SH HISTORY +Both code and manual page were +written at U of T. +They are intended to be compatible with the Bell V8 \fIregexp\fR(3), +but are not derived from Bell code. +.SH BUGS +Empty branches and empty regular expressions are not portable to V8. +.PP +The restriction against +applying `*' or `+' to a possibly-null operand is an artifact of the +simplistic implementation. +.PP +Does not support \fIegrep\fR's newline-separated branches; +neither does the V8 \fIregexp\fR(3), though. +.PP +Due to emphasis on +compactness and simplicity, +it's not strikingly fast. +It does give special attention to handling simple cases quickly. diff --git a/usr/src/lib/libc/gen/regexp/regexp.h b/usr/src/lib/libc/gen/regexp/regexp.h new file mode 100644 index 0000000000..73d6bf4124 --- /dev/null +++ b/usr/src/lib/libc/gen/regexp/regexp.h @@ -0,0 +1,21 @@ +/* + * Definitions etc. 
for regexp(3) routines. + * + * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], + * not the System V one. + */ +#define NSUBEXP 10 /* Number of startp/endp pairs: slot 0 is the whole match, the rest are parenthesized subexpressions. */ +typedef struct regexp { + char *startp[NSUBEXP]; /* First character of each matched substring. */ + char *endp[NSUBEXP]; /* First character following each matched substring. */ + char regstart; /* Internal use only. */ + char reganch; /* Internal use only. */ + char *regmust; /* Internal use only. */ + int regmlen; /* Internal use only. */ + char program[1]; /* Unwarranted chumminess with compiler. */ +} regexp; + +extern regexp *regcomp(); /* Returns NULL on failure (if regerror returns). */ +extern int regexec(); /* Returns 1 for success, 0 for failure. */ +extern void regsub(); +extern void regerror(); -- 2.20.1