git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental

... / ...

Commit	Line	Data
	1	.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \\|\(em\\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
	65	.hy 0
	66	.if n .na
	67	.\"
	68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	69	.\" Fear. Run. Save yourself. No user-serviceable parts.
	70	. \" fudge factors for nroff and troff
	71	.if n \{\
	72	. ds #H 0
	73	. ds #V .8m
	74	. ds #F .3m
	75	. ds #[ \f1
	76	. ds #] \fP
	77	.\}
	78	.if t \{\
	79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
	80	. ds #V .6m
	81	. ds #F 0
	82	. ds #[ \&
	83	. ds #] \&
	84	.\}
	85	. \" simple accents for nroff and troff
	86	.if n \{\
	87	. ds ' \&
	88	. ds ` \&
	89	. ds ^ \&
	90	. ds , \&
	91	. ds ~ ~
	92	. ds /
	93	.\}
	94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
	101	.\}
	102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
	104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
	106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
	108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
	109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
	110	.ds ae a\h'-(\w'a'u*4/10)'e
	111	.ds Ae A\h'-(\w'A'u*4/10)'E
	112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
	115	. \" for low resolution devices (crt and lpr)
	116	.if \n(.H>23 .if \n(.V>19 \
	117	\{\
	118	. ds : e
	119	. ds 8 ss
	120	. ds o a
	121	. ds d- d\h'-1'\(ga
	122	. ds D- D\h'-1'\(hy
	123	. ds th \o'bp'
	124	. ds Th \o'LP'
	125	. ds ae ae
	126	. ds Ae AE
	127	.\}
	128	.rm #[ #] #H #V #F C
	129	.\" ========================================================================
	130	.\"
	131	.IX Title "PERLFAQ6 1"
	132	.TH PERLFAQ6 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide"
	133	.SH "NAME"
	134	perlfaq6 \- Regular Expressions ($Revision: 1.38 $, $Date: 2005/12/31 00:54:37 $)
	135	.SH "DESCRIPTION"
	136	.IX Header "DESCRIPTION"
	137	This section is surprisingly small because the rest of the \s-1FAQ\s0 is
	138	littered with answers involving regular expressions. For example,
	139	decoding a \s-1URL\s0 and checking whether something is a number are handled
	140	with regular expressions, but those answers are found elsewhere in
	141	this document (in perlfaq9: \*(L"How do I decode or create those %\-encodings
	142	on the web\*(R" and perlfaq4: \*(L"How do I determine whether a scalar is
	143	a number/whole/integer/float\*(R", to be precise).
	144	.Sh "How can I hope to use regular expressions without creating illegible and unmaintainable code?"
	145	.IX Xref "regex, legibility regexp, legibility regular expression, legibility x"
	146	.IX Subsection "How can I hope to use regular expressions without creating illegible and unmaintainable code?"
	147	Three techniques can make regular expressions maintainable and
	148	understandable.
	149	.IP "Comments Outside the Regex" 4
	150	.IX Item "Comments Outside the Regex"
	151	Describe what you're doing and how you're doing it, using normal Perl
	152	comments.
	153	.Sp
	154	.Vb 3
	155	\& # turn the line into the first word, a colon, and the
	156	\& # number of characters on the rest of the line
	157	\& s/^(\ew+)(.*)/ lc($1) . ":" . length($2) /meg;
	158	.Ve
	159	.IP "Comments Inside the Regex" 4
	160	.IX Item "Comments Inside the Regex"
	161	The \f(CW\*(C`/x\*(C'\fR modifier causes whitespace to be ignored in a regex pattern
	162	(except in a character class), and also allows you to use normal
	163	comments there, too. As you can imagine, whitespace and comments help
	164	a lot.
	165	.Sp
	166	\&\f(CW\*(C`/x\*(C'\fR lets you turn this:
	167	.Sp
	168	.Vb 1
	169	\& s{<(?:[^>'"]*|".*?"|'.*?')+>}{}gs;
	170	.Ve
	171	.Sp
	172	into this:
	173	.Sp
	174	.Vb 10
	175	\& s{ < # opening angle bracket
	176	\& (?: # Non-backreffing grouping paren
	177	\& [^>'"] * # 0 or more things that are neither > nor ' nor "
	178	\& | # or else
	179	\& ".*?" # a section between double quotes (stingy match)
	180	\& | # or else
	181	\& '.*?' # a section between single quotes (stingy match)
	182	\& ) + # all occurring one or more times
	183	\& > # closing angle bracket
	184	\& }{}gsx; # replace with nothing, i.e. delete
	185	.Ve
	186	.Sp
	187	It's still not quite so clear as prose, but it is very useful for
	188	describing the meaning of each part of the pattern.
	189	.IP "Different Delimiters" 4
	190	.IX Item "Different Delimiters"
	191	While we normally think of patterns as being delimited with \f(CW\*(C`/\*(C'\fR
	192	characters, they can be delimited by almost any character. perlre
	193	describes this. For example, the \f(CW\*(C`s///\*(C'\fR above uses braces as
	194	delimiters. Selecting another delimiter can avoid quoting the
	195	delimiter within the pattern:
	196	.Sp
	197	.Vb 2
	198	\& s/\e/usr\e/local/\e/usr\e/share/g; # bad delimiter choice
	199	\& s#/usr/local#/usr/share#g; # better
	200	.Ve
	201	.Sh "I'm having trouble matching over more than one line. What's wrong?"
	202	.IX Xref "regex, multiline regexp, multiline regular expression, multiline"
	203	.IX Subsection "I'm having trouble matching over more than one line. What's wrong?"
	204	Either you don't have more than one line in the string you're looking
	205	at (probably), or else you aren't using the correct modifier(s) on
	206	your pattern (possibly).
	207	.PP
	208	There are many ways to get multiline data into a string. If you want
	209	it to happen automatically while reading input, you'll want to set $/
	210	(probably to '' for paragraphs or \f(CW\*(C`undef\*(C'\fR for the whole file) to
	211	allow you to read more than one line at a time.
	212	.PP
	213	Read perlre to help you decide which of \f(CW\*(C`/s\*(C'\fR and \f(CW\*(C`/m\*(C'\fR (or both)
	214	you might want to use: \f(CW\*(C`/s\*(C'\fR allows dot to include newline, and \f(CW\*(C`/m\*(C'\fR
	215	allows caret and dollar to match next to a newline, not just at the
	216	end of the string. You do need to make sure that you've actually
	217	got a multiline string in there.
	218	.PP
	219	For example, this program detects duplicate words, even when they span
	220	line breaks (but not paragraph ones). For this example, we don't need
	221	\&\f(CW\*(C`/s\*(C'\fR because we aren't using dot in a regular expression that we want
	222	to cross line boundaries. Neither do we need \f(CW\*(C`/m\*(C'\fR because we aren't
	223	wanting caret or dollar to match at any point inside the record next
	224	to newlines. But it's imperative that $/ be set to something other
	225	than the default, or else we won't actually ever have a multiline
	226	record read in.
	227	.PP
	228	.Vb 6
	229	\& $/ = ''; # read in whole paragraphs, not just one line
	230	\& while ( <> ) {
	231	\& while ( /\eb([\ew'-]+)(\es+\e1)+\eb/gi ) { # word starts alpha
	232	\& print "Duplicate $1 at paragraph $.\en";
	233	\& }
	234	\& }
	235	.Ve
	236	.PP
	237	Here's code that finds sentences that begin with \*(L"From \*(R" (which would
	238	be mangled by many mailers):
	239	.PP
	240	.Vb 6
	241	\& $/ = ''; # read in whole paragraphs, not just one line
	242	\& while ( <> ) {
	243	\& while ( /^From /gm ) { # /m makes ^ match next to \en
	244	\& print "leading from in paragraph $.\en";
	245	\& }
	246	\& }
	247	.Ve
	248	.PP
	249	Here's code that finds everything between \s-1START\s0 and \s-1END\s0 in a paragraph:
	250	.PP
	251	.Vb 6
	252	\& undef $/; # read in whole file, not just one line or paragraph
	253	\& while ( <> ) {
	254	\& while ( /START(.*?)END/sgm ) { # /s makes . cross line boundaries
	255	\& print "$1\en";
	256	\& }
	257	\& }
	258	.Ve
	259	.Sh "How can I pull out lines between two patterns that are themselves on different lines?"
	260	.IX Xref ".."
	261	.IX Subsection "How can I pull out lines between two patterns that are themselves on different lines?"
	262	You can use Perl's somewhat exotic \f(CW\*(C`..\*(C'\fR operator (documented in
	263	perlop):
	264	.PP
	265	.Vb 1
	266	\& perl -ne 'print if /START/ .. /END/' file1 file2 ...
	267	.Ve
	268	.PP
	269	If you wanted text and not lines, you would use
	270	.PP
	271	.Vb 1
	272	\& perl -0777 -ne 'print "$1\en" while /START(.*?)END/gs' file1 file2 ...
	273	.Ve
	274	.PP
	275	But if you want nested occurrences of \f(CW\*(C`START\*(C'\fR through \f(CW\*(C`END\*(C'\fR, you'll
	276	run up against the problem described in the question in this section
	277	on matching balanced text.
	278	.PP
	279	Here's another example of using \f(CW\*(C`..\*(C'\fR:
	280	.PP
	281	.Vb 7
	282	\& while (<>) {
	283	\& $in_header = 1 .. /^$/;
	284	\& $in_body = /^$/ .. eof();
	285	\& # now choose between them
	286	\& } continue {
	287	\& reset if eof(); # fix $.
	288	\& }
	289	.Ve
	290	.Sh "I put a regular expression into $/ but it didn't work. What's wrong?"
	291	.IX Xref "$ , regexes in $INPUT_RECORD_SEPARATOR, regexes in $RS, regexes in"
	292	.IX Subsection "I put a regular expression into $/ but it didn't work. What's wrong?"
	293	Up to Perl 5.8.0, $/ has to be a string. This may change in 5.10,
	294	but don't get your hopes up. Until then, you can use these examples
	295	if you really need to do this.
	296	.PP
	297	If you have File::Stream, this is easy.
	298	.PP
	299	.Vb 5
	300	\& use File::Stream;
	301	\& my $stream = File::Stream->new(
	302	\& $filehandle,
	303	\& separator => qr/\es,\es/,
	304	\& );
	305	.Ve
	306	.PP
	307	.Vb 1
	308	\& print "$_\en" while <$stream>;
	309	.Ve
	310	.PP
	311	If you don't have File::Stream, you have to do a little more work.
	312	.PP
	313	You can use the four argument form of sysread to continually add to
	314	a buffer. After you add to the buffer, you check if you have a
	315	complete line (using your regular expression).
	316	.PP
	317	.Vb 7
	318	\& local $_ = "";
	319	\& while( sysread FH, $_, 8192, length ) {
	320	\& while( s/^((?s).*?)your_pattern// ) {
	321	\& my $record = $1;
	322	\& # do stuff here.
	323	\& }
	324	\& }
	325	.Ve
	326	.PP
	327	.Vb 3
	328	\& You can do the same thing with foreach and a match using the
	329	\& c flag and the \eG anchor, if you do not mind your entire file
	330	\& being in memory at the end.
	331	.Ve
	332	.PP
	333	.Vb 7
	334	\& local $_ = "";
	335	\& while( sysread FH, $_, 8192, length ) {
	336	\& foreach my $record ( m/\eG((?s).*?)your_pattern/gc ) {
	337	\& # do stuff here.
	338	\& }
	339	\& substr( $_, 0, pos ) = "" if pos;
	340	\& }
	341	.Ve
	342	.Sh "How do I substitute case insensitively on the \s-1LHS\s0 while preserving case on the \s-1RHS\s0?"
	343	.IX Xref "replace, case preserving substitute, case preserving substitution, case preserving s, case preserving"
	344	.IX Subsection "How do I substitute case insensitively on the LHS while preserving case on the RHS?"
	345	Here's a lovely Perlish solution by Larry Rosler. It exploits
	346	properties of bitwise xor on \s-1ASCII\s0 strings.
	347	.PP
	348	.Vb 1
	349	\& $_= "this is a TEsT case";
	350	.Ve
	351	.PP
	352	.Vb 2
	353	\& $old = 'test';
	354	\& $new = 'success';
	355	.Ve
	356	.PP
	357	.Vb 5
	358	\& s{(\eQ$old\eE)}
	359	\& { uc $new | (uc $1 ^ $1) .
	360	\& (uc(substr $1, -1) ^ substr $1, -1) x
	361	\& (length($new) - length $1)
	362	\& }egi;
	363	.Ve
	364	.PP
	365	.Vb 1
	366	\& print;
	367	.Ve
	368	.PP
	369	And here it is as a subroutine, modeled after the above:
	370	.PP
	371	.Vb 3
	372	\& sub preserve_case($$) {
	373	\& my ($old, $new) = @_;
	374	\& my $mask = uc $old ^ $old;
	375	.Ve
	376	.PP
	377	.Vb 3
	378	\& uc $new | $mask .
	379	\& substr($mask, -1) x (length($new) - length($old))
	380	\& }
	381	.Ve
	382	.PP
	383	.Vb 3
	384	\& $a = "this is a TEsT case";
	385	\& $a =~ s/(test)/preserve_case($1, "success")/egi;
	386	\& print "$a\en";
	387	.Ve
	388	.PP
	389	This prints:
	390	.PP
	391	.Vb 1
	392	\& this is a SUcCESS case
	393	.Ve
	394	.PP
	395	As an alternative, to keep the case of the replacement word if it is
	396	longer than the original, you can use this code, by Jeff Pinyan:
	397	.PP
	398	.Vb 3
	399	\& sub preserve_case {
	400	\& my ($from, $to) = @_;
	401	\& my ($lf, $lt) = map length, @_;
	402	.Ve
	403	.PP
	404	.Vb 2
	405	\& if ($lt < $lf) { $from = substr $from, 0, $lt }
	406	\& else { $from .= substr $to, $lf }
	407	.Ve
	408	.PP
	409	.Vb 2
	410	\& return uc $to | ($from ^ uc $from);
	411	\& }
	412	.Ve
	413	.PP
	414	This changes the sentence to \*(L"this is a SUcCess case.\*(R"
	415	.PP
	416	Just to show that C programmers can write C in any programming language,
	417	if you prefer a more C\-like solution, the following script makes the
	418	substitution have the same case, letter by letter, as the original.
	419	(It also happens to run about 240% slower than the Perlish solution runs.)
	420	If the substitution has more characters than the string being substituted,
	421	the case of the last character is used for the rest of the substitution.
	422	.PP
	423	.Vb 8
	424	\& # Original by Nathan Torkington, massaged by Jeffrey Friedl
	425	\& #
	426	\& sub preserve_case($$)
	427	\& {
	428	\& my ($old, $new) = @_;
	429	\& my ($state) = 0; # 0 = no change; 1 = lc; 2 = uc
	430	\& my ($i, $oldlen, $newlen, $c) = (0, length($old), length($new));
	431	\& my ($len) = $oldlen < $newlen ? $oldlen : $newlen;
	432	.Ve
	433	.PP
	434	.Vb 21
	435	\& for ($i = 0; $i < $len; $i++) {
	436	\& if ($c = substr($old, $i, 1), $c =~ /[\eW\ed_]/) {
	437	\& $state = 0;
	438	\& } elsif (lc $c eq $c) {
	439	\& substr($new, $i, 1) = lc(substr($new, $i, 1));
	440	\& $state = 1;
	441	\& } else {
	442	\& substr($new, $i, 1) = uc(substr($new, $i, 1));
	443	\& $state = 2;
	444	\& }
	445	\& }
	446	\& # finish up with any remaining new (for when new is longer than old)
	447	\& if ($newlen > $oldlen) {
	448	\& if ($state == 1) {
	449	\& substr($new, $oldlen) = lc(substr($new, $oldlen));
	450	\& } elsif ($state == 2) {
	451	\& substr($new, $oldlen) = uc(substr($new, $oldlen));
	452	\& }
	453	\& }
	454	\& return $new;
	455	\& }
	456	.Ve
	457	.ie n .Sh "How can I make ""\ew"" match national character sets?"
	458	.el .Sh "How can I make \f(CW\ew\fP match national character sets?"
	459	.IX Xref "\w"
	460	.IX Subsection "How can I make w match national character sets?"
	461	Put \f(CW\*(C`use locale;\*(C'\fR in your script. The \ew character class is taken
	462	from the current locale.
	463	.PP
	464	See perllocale for details.
	465	.ie n .Sh "How can I match a locale-smart version of ""/[a\-zA\-Z]/""?"
	466	.el .Sh "How can I match a locale-smart version of \f(CW/[a\-zA\-Z]/\fP?"
	467	.IX Xref "alpha"
	468	.IX Subsection "How can I match a locale-smart version of /[a-zA-Z]/?"
	469	You can use the \s-1POSIX\s0 character class syntax \f(CW\*(C`/[[:alpha:]]/\*(C'\fR
	470	documented in perlre.
	471	.PP
	472	No matter which locale you are in, the alphabetic characters are
	473	the characters in \ew without the digits and the underscore.
	474	As a regex, that looks like \f(CW\*(C`/[^\eW\ed_]/\*(C'\fR. Its complement,
	475	the non\-alphabetics, is then everything in \eW along with
	476	the digits and the underscore, or \f(CW\*(C`/[\eW\ed_]/\*(C'\fR.
	477	.Sh "How can I quote a variable to use in a regex?"
	478	.IX Xref "regex, escaping regexp, escaping regular expression, escaping"
	479	.IX Subsection "How can I quote a variable to use in a regex?"
	480	The Perl parser will expand \f(CW$variable\fR and \f(CW@variable\fR references in
	481	regular expressions unless the delimiter is a single quote. Remember,
	482	too, that the right-hand side of a \f(CW\*(C`s///\*(C'\fR substitution is considered
	483	a double-quoted string (see perlop for more details). Remember
	484	also that any regex special characters will be acted on unless you
	485	precede the substitution with \eQ. Here's an example:
	486	.PP
	487	.Vb 2
	488	\& $string = "Placido P. Octopus";
	489	\& $regex = "P.";
	490	.Ve
	491	.PP
	492	.Vb 2
	493	\& $string =~ s/$regex/Polyp/;
	494	\& # $string is now "Polypacido P. Octopus"
	495	.Ve
	496	.PP
	497	Because \f(CW\*(C`.\*(C'\fR is special in regular expressions, and can match any
	498	single character, the regex \f(CW\*(C`P.\*(C'\fR here has matched the <Pl> in the
	499	original string.
	500	.PP
	501	To escape the special meaning of \f(CW\*(C`.\*(C'\fR, we use \f(CW\*(C`\eQ\*(C'\fR:
	502	.PP
	503	.Vb 2
	504	\& $string = "Placido P. Octopus";
	505	\& $regex = "P.";
	506	.Ve
	507	.PP
	508	.Vb 2
	509	\& $string =~ s/\eQ$regex/Polyp/;
	510	\& # $string is now "Placido Polyp Octopus"
	511	.Ve
	512	.PP
	513	The use of \f(CW\*(C`\eQ\*(C'\fR causes the <.> in the regex to be treated as a
	514	regular character, so that \f(CW\*(C`P.\*(C'\fR matches a \f(CW\*(C`P\*(C'\fR followed by a dot.
	515	.ie n .Sh "What is ""/o"" really for?"
	516	.el .Sh "What is \f(CW/o\fP really for?"
	517	.IX Xref " o"
	518	.IX Subsection "What is /o really for?"
	519	Using a variable in a regular expression match forces a re-evaluation
	520	(and perhaps recompilation) each time the regular expression is
	521	encountered. The \f(CW\*(C`/o\*(C'\fR modifier locks in the regex the first time
	522	it's used. This always happens in a constant regular expression, and
	523	in fact, the pattern was compiled into the internal format at the same
	524	time your entire program was.
	525	.PP
	526	Use of \f(CW\*(C`/o\*(C'\fR is irrelevant unless variable interpolation is used in
	527	the pattern, and if so, the regex engine will neither know nor care
	528	whether the variables change after the pattern is evaluated the \fIvery
	529	first\fR time.
	530	.PP
	531	\&\f(CW\*(C`/o\*(C'\fR is often used to gain an extra measure of efficiency by not
	532	performing subsequent evaluations when you know it won't matter
	533	(because you know the variables won't change), or more rarely, when
	534	you don't want the regex to notice if they do.
	535	.PP
	536	For example, here's a \*(L"paragrep\*(R" program:
	537	.PP
	538	.Vb 5
	539	\& $/ = ''; # paragraph mode
	540	\& $pat = shift;
	541	\& while (<>) {
	542	\& print if /$pat/o;
	543	\& }
	544	.Ve
	545	.Sh "How do I use a regular expression to strip C style comments from a file?"
	546	.IX Subsection "How do I use a regular expression to strip C style comments from a file?"
	547	While this actually can be done, it's much harder than you'd think.
	548	For example, this one-liner
	549	.PP
	550	.Vb 1
	551	\& perl -0777 -pe 's{/\e*.*?\e*/}{}gs' foo.c
	552	.Ve
	553	.PP
	554	will work in many but not all cases. You see, it's too simple-minded for
	555	certain kinds of C programs, in particular, those with what appear to be
	556	comments in quoted strings. For that, you'd need something like this,
	557	created by Jeffrey Friedl and later modified by Fred Curtis.
	558	.PP
	559	.Vb 4
	560	\& $/ = undef;
	561	\& $_ = <>;
	562	\& s#/\e*[^*]*\e*+([^/*][^*]*\e*+)*/|("(\e\e.|[^"\e\e])*"|'(\e\e.|[^'\e\e])*'|.[^/"'\e\e]*)#defined $2 ? $2 : ""#gse;
	563	\& print;
	564	.Ve
	565	.PP
	566	This could, of course, be more legibly written with the \f(CW\*(C`/x\*(C'\fR modifier, adding
	567	whitespace and comments. Here it is expanded, courtesy of Fred Curtis.
	568	.PP
	569	.Vb 8
	570	\& s{
	571	\& /\e* ## Start of /* ... */ comment
	572	\& [^*]*\e*+ ## Non-* followed by 1-or-more *'s
	573	\& (
	574	\& [^/*][^*]*\e*+
	575	\& )* ## 0-or-more things which don't start with /
	576	\& ## but do end with '*'
	577	\& / ## End of /* ... */ comment
	578	.Ve
	579	.PP
	580	.Vb 1
	581	\& | ## OR various things which aren't comments:
	582	.Ve
	583	.PP
	584	.Vb 8
	585	\& (
	586	\& " ## Start of " ... " string
	587	\& (
	588	\& \e\e. ## Escaped char
	589	\& | ## OR
	590	\& [^"\e\e] ## Non "\e
	591	\& )*
	592	\& " ## End of " ... " string
	593	.Ve
	594	.PP
	595	.Vb 1
	596	\& | ## OR
	597	.Ve
	598	.PP
	599	.Vb 7
	600	\& ' ## Start of ' ... ' string
	601	\& (
	602	\& \e\e. ## Escaped char
	603	\& | ## OR
	604	\& [^'\e\e] ## Non '\e
	605	\& )*
	606	\& ' ## End of ' ... ' string
	607	.Ve
	608	.PP
	609	.Vb 1
	610	\& | ## OR
	611	.Ve
	612	.PP
	613	.Vb 4
	614	\& . ## Anything other char
	615	\& [^/"'\e\e]* ## Chars which doesn't start a comment, string or escape
	616	\& )
	617	\& }{defined $2 ? $2 : ""}gxse;
	618	.Ve
	619	.PP
	620	A slight modification also removes \*(C+ comments:
	621	.PP
	622	.Vb 1
	623	\& s#/\e*[^*]*\e*+([^/*][^*]*\e*+)*/|//[^\en]*|("(\e\e.|[^"\e\e])*"|'(\e\e.|[^'\e\e])*'|.[^/"'\e\e]*)#defined $2 ? $2 : ""#gse;
	624	.Ve
	625	.Sh "Can I use Perl regular expressions to match balanced text?"
	626	.IX Xref "regex, matching balanced test regexp, matching balanced test regular expression, matching balanced test"
	627	.IX Subsection "Can I use Perl regular expressions to match balanced text?"
	628	Historically, Perl regular expressions were not capable of matching
	629	balanced text. As of more recent versions of perl including 5.6.1
	630	experimental features have been added that make it possible to do this.
	631	Look at the documentation for the (??{ }) construct in recent perlre manual
	632	pages to see an example of matching balanced parentheses. Be sure to take
	633	special notice of the warnings present in the manual before making use
	634	of this feature.
	635	.PP
	636	\&\s-1CPAN\s0 contains many modules that can be useful for matching text
	637	depending on the context. Damian Conway provides some useful
	638	patterns in Regexp::Common. The module Text::Balanced provides a
	639	general solution to this problem.
	640	.PP
	641	One of the common applications of balanced text matching is working
	642	with \s-1XML\s0 and \s-1HTML\s0. There are many modules available that support
	643	these needs. Two examples are HTML::Parser and XML::Parser. There
	644	are many others.
	645	.PP
	646	An elaborate subroutine (for 7\-bit \s-1ASCII\s0 only) to pull out balanced
	647	and possibly nested single chars, like \f(CW\*(C``\*(C'\fR and \f(CW\*(C`'\*(C'\fR, \f(CW\*(C`{\*(C'\fR and \f(CW\*(C`}\*(C'\fR,
	648	or \f(CW\*(C`(\*(C'\fR and \f(CW\*(C`)\*(C'\fR can be found in
	649	http://www.cpan.org/authors/id/TOMC/scripts/pull_quotes.gz .
	650	.PP
	651	The C::Scan module from \s-1CPAN\s0 also contains such subs for internal use,
	652	but they are undocumented.
	653	.Sh "What does it mean that regexes are greedy? How can I get around it?"
	654	.IX Xref "greedy greediness"
	655	.IX Subsection "What does it mean that regexes are greedy? How can I get around it?"
	656	Most people mean that greedy regexes match as much as they can.
	657	Technically speaking, it's actually the quantifiers (\f(CW\*(C`?\*(C'\fR, \f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR,
	658	\&\f(CW\*(C`{}\*(C'\fR) that are greedy rather than the whole pattern; Perl prefers local
	659	greed and immediate gratification to overall greed. To get non-greedy
	660	versions of the same quantifiers, use (\f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, \f(CW\*(C`{}?\*(C'\fR).
	661	.PP
	662	An example:
	663	.PP
	664	.Vb 3
	665	\& $s1 = $s2 = "I am very very cold";
	666	\& $s1 =~ s/ve.*y //; # I am cold
	667	\& $s2 =~ s/ve.*?y //; # I am very cold
	668	.Ve
	669	.PP
	670	Notice how the second substitution stopped matching as soon as it
	671	encountered \*(L"y \*(R". The \f(CW\*(C`*?\*(C'\fR quantifier effectively tells the regular
	672	expression engine to find a match as quickly as possible and pass
	673	control on to whatever is next in line, like you would if you were
	674	playing hot potato.
	675	.Sh "How do I process each word on each line?"
	676	.IX Xref "word"
	677	.IX Subsection "How do I process each word on each line?"
	678	Use the split function:
	679	.PP
	680	.Vb 5
	681	\& while (<>) {
	682	\& foreach $word ( split ) {
	683	\& # do something with $word here
	684	\& }
	685	\& }
	686	.Ve
	687	.PP
	688	Note that this isn't really a word in the English sense; it's just
	689	chunks of consecutive non-whitespace characters.
	690	.PP
	691	To work with only alphanumeric sequences (including underscores), you
	692	might consider
	693	.PP
	694	.Vb 5
	695	\& while (<>) {
	696	\& foreach $word (m/(\ew+)/g) {
	697	\& # do something with $word here
	698	\& }
	699	\& }
	700	.Ve
	701	.Sh "How can I print out a word-frequency or line-frequency summary?"
	702	.IX Subsection "How can I print out a word-frequency or line-frequency summary?"
	703	To do this, you have to parse out each word in the input stream. We'll
	704	pretend that by word you mean chunk of alphabetics, hyphens, or
	705	apostrophes, rather than the non-whitespace chunk idea of a word given
	706	in the previous question:
	707	.PP
	708	.Vb 8
	709	\& while (<>) {
	710	\& while ( /(\eb[^\eW_\ed][\ew'-]+\eb)/g ) { # misses "`sheep'"
	711	\& $seen{$1}++;
	712	\& }
	713	\& }
	714	\& while ( ($word, $count) = each %seen ) {
	715	\& print "$count $word\en";
	716	\& }
	717	.Ve
	718	.PP
	719	If you wanted to do the same thing for lines, you wouldn't need a
	720	regular expression:
	721	.PP
	722	.Vb 6
	723	\& while (<>) {
	724	\& $seen{$_}++;
	725	\& }
	726	\& while ( ($line, $count) = each %seen ) {
	727	\& print "$count $line";
	728	\& }
	729	.Ve
	730	.PP
	731	If you want these output in a sorted order, see perlfaq4: \*(L"How do I
	732	sort a hash (optionally by value instead of key)?\*(R".
	733	.Sh "How can I do approximate matching?"
	734	.IX Xref "match, approximate matching, approximate"
	735	.IX Subsection "How can I do approximate matching?"
	736	See the module String::Approx available from \s-1CPAN\s0.
	737	.Sh "How do I efficiently match many regular expressions at once?"
	738	.IX Xref "regex, efficiency regexp, efficiency regular expression, efficiency"
	739	.IX Subsection "How do I efficiently match many regular expressions at once?"
	740	( contributed by brian d foy )
	741	.PP
	742	Avoid asking Perl to compile a regular expression every time
	743	you want to match it. In this example, perl must recompile
	744	the regular expression for every iteration of the \fIforeach()\fR
	745	loop since it has no way to know what \f(CW$pattern\fR will be.
	746	.PP
	747	.Vb 1
	748	\& @patterns = qw( foo bar baz );
	749	.Ve
	750	.PP
	751	.Vb 8
	752	\& LINE: while( <> )
	753	\& {
	754	\& foreach $pattern ( @patterns )
	755	\& {
	756	\& print if /\eb$pattern\eb/i;
	757	\& next LINE;
	758	\& }
	759	\& }
	760	.Ve
	761	.PP
	762	The qr// operator showed up in perl 5.005. It compiles a
	763	regular expression, but doesn't apply it. When you use the
	764	pre-compiled version of the regex, perl does less work. In
	765	this example, I inserted a \fImap()\fR to turn each pattern into
	766	its pre-compiled form. The rest of the script is the same,
	767	but faster.
	768	.PP
	769	.Vb 1
	770	\& @patterns = map { qr/\eb$_\eb/i } qw( foo bar baz );
	771	.Ve
	772	.PP
	773	.Vb 8
	774	\& LINE: while( <> )
	775	\& {
	776	\& foreach $pattern ( @patterns )
	777	\& {
	778	\& print if /\eb$pattern\eb/i;
	779	\& next LINE;
	780	\& }
	781	\& }
	782	.Ve
	783	.PP
	784	In some cases, you may be able to make several patterns into
	785	a single regular expression. Beware of situations that require
	786	backtracking though.
	787	.PP
	788	.Vb 1
	789	\& $regex = join '|', qw( foo bar baz );
	790	.Ve
	791	.PP
	792	.Vb 4
	793	\& LINE: while( <> )
	794	\& {
	795	\& print if /\eb(?:$regex)\eb/i;
	796	\& }
	797	.Ve
	798	.PP
	799	For more details on regular expression efficiency, see Mastering
	800	Regular Expressions by Jeffrey Friedl. He explains how regular
	801	expression engines work and why some patterns are surprisingly
	802	inefficient. Once you understand how perl applies regular
	803	expressions, you can tune them for individual situations.
	804	.ie n .Sh "Why don't word-boundary searches with ""\eb"" work for me?"
	805	.el .Sh "Why don't word-boundary searches with \f(CW\eb\fP work for me?"
	806	.IX Xref "\b"
	807	.IX Subsection "Why don't word-boundary searches with b work for me?"
	808	(contributed by brian d foy)
	809	.PP
	810	Ensure that you know what \eb really does: it's the boundary between a
	811	word character, \ew, and something that isn't a word character. That
	812	thing that isn't a word character might be \eW, but it can also be the
	813	start or end of the string.
	814	.PP
	815	It's not (not!) the boundary between whitespace and non\-whitespace,
	816	and it's not the stuff between words we use to create sentences.
	817	.PP
	818	In regex speak, a word boundary (\eb) is a \*(L"zero width assertion\*(R",
	819	meaning that it doesn't represent a character in the string, but a
	820	condition at a certain position.
	821	.PP
	822	For the regular expression, /\ebPerl\eb/, there has to be a word
	823	boundary before the \*(L"P\*(R" and after the \*(L"l\*(R". As long as something other
	824	than a word character precedes the \*(L"P\*(R" and succeeds the \*(L"l\*(R", the
	825	pattern will match. These strings match /\ebPerl\eb/.
	826	.PP
	827	.Vb 4
	828	\& "Perl" # no word char before P or after l
	829	\& "Perl " # same as previous (space is not a word char)
	830	\& "'Perl'" # the ' char is not a word char
	831	\& "Perl's" # no word char before P, non-word char after "l"
	832	.Ve
	833	.PP
	834	These strings do not match /\ebPerl\eb/.
	835	.PP
	836	.Vb 2
	837	\& "Perl_" # _ is a word char!
	838	\& "Perler" # no word char before P, but one after l
	839	.Ve
	840	.PP
	841	You don't have to use \eb to match words though. You can look for
	842	non-word characters surrounded by word characters. These strings
	843	match the pattern /\eb'\eb/.
	844	.PP
	845	.Vb 2
	846	\& "don't" # the ' char is surrounded by "n" and "t"
	847	\& "qep'a'" # the ' char is surrounded by "p" and "a"
	848	.Ve
	849	.PP
	850	These strings do not match /\eb'\eb/.
	851	.PP
	852	.Vb 1
	853	\& "foo'" # there is no word char after non-word '
	854	.Ve
	855	.PP
	856	You can also use the complement of \eb, \eB, to specify that there
	857	should not be a word boundary.
	858	.PP
	859	In the pattern /\eBam\eB/, there must be a word character before the \*(L"a\*(R"
	860	and after the \*(L"m\*(R". These patterns match /\eBam\eB/:
	861	.PP
	862	.Vb 2
	863	\& "llama" # "am" surrounded by word chars
	864	\& "Samuel" # same
	865	.Ve
	866	.PP
	867	These strings do not match /\eBam\eB/.
	868	.PP
	869	.Vb 2
	870	\& "Sam" # no word boundary before "a", but one after "m"
	871	\& "I am Sam" # "am" surrounded by non-word chars
	872	.Ve
	873	.Sh "Why does using $&, $`, or $' slow my program down?"
	874	.IX Xref "$MATCH $& $POSTMATCH $' $PREMATCH $`"
	875	.IX Subsection "Why does using $&, $`, or $' slow my program down?"
	876	(contributed by Anno Siegel)
	877	.PP
	878	Once Perl sees that you need one of these variables anywhere in the
	879	program, it provides them on each and every pattern match. That means
	880	that on every pattern match the entire string will be copied, part of it
	881	to $`, part to $&, and part to $'. Thus the penalty is most severe with
	882	long strings and patterns that match often. Avoid $&, $', and $` if you
	883	can, but if you can't, once you've used them at all, use them at will
	884	because you've already paid the price. Remember that some algorithms
	885	really appreciate them. As of the 5.005 release, the $& variable is no
	886	longer \*(L"expensive\*(R" the way the other two are.
	887	.PP
	888	Since Perl 5.6.1 the special variables @\- and @+ can functionally replace
	889	$`, $& and $'. These arrays contain pointers to the beginning and end
	890	of each match (see perlvar for the full story), so they give you
	891	essentially the same information, but without the risk of excessive
	892	string copying.
	893	.ie n .Sh "What good is ""\eG"" in a regular expression?"
	894	.el .Sh "What good is \f(CW\eG\fP in a regular expression?"
	895	.IX Xref "\G"
	896	.IX Subsection "What good is G in a regular expression?"
	897	You use the \f(CW\*(C`\eG\*(C'\fR anchor to start the next match on the same
	898	string where the last match left off. The regular
	899	expression engine cannot skip over any characters to find
	900	the next match with this anchor, so \f(CW\*(C`\eG\*(C'\fR is similar to the
	901	beginning of string anchor, \f(CW\*(C`^\*(C'\fR. The \f(CW\*(C`\eG\*(C'\fR anchor is typically
	902	used with the \f(CW\*(C`g\*(C'\fR flag. It uses the value of \fIpos()\fR
	903	as the position to start the next match. As the match
	904	operator makes successive matches, it updates \fIpos()\fR with the
	905	position of the next character past the last match (or the
	906	first character of the next match, depending on how you like
	907	to look at it). Each string has its own \fIpos()\fR value.
	908	.PP
	909	Suppose you want to match all of the consecutive pairs of digits
	910	in a string like \*(L"1122a44\*(R" and stop matching when you
	911	encounter non\-digits. You want to match \f(CW11\fR and \f(CW22\fR but
	912	the letter \f(CW\*(C`a\*(C'\fR shows up between \f(CW22\fR and \f(CW44\fR and you want
	913	to stop at \f(CW\*(C`a\*(C'\fR. Simply matching pairs of digits skips over
	914	the \f(CW\*(C`a\*(C'\fR and still matches \f(CW44\fR.
	915	.PP
	916	.Vb 2
	917	\& $_ = "1122a44";
	918	\& my @pairs = m/(\ed\ed)/g; # qw( 11 22 44 )
	919	.Ve
	920	.PP
	921	If you use the \eG anchor, you force the match after \f(CW22\fR to
	922	start with the \f(CW\*(C`a\*(C'\fR. The regular expression cannot match
	923	there since it does not find a digit, so the next match
	924	fails and the match operator returns the pairs it already
	925	found.
	926	.PP
	927	.Vb 2
	928	\& $_ = "1122a44";
	929	\& my @pairs = m/\eG(\ed\ed)/g; # qw( 11 22 )
	930	.Ve
	931	.PP
	932	You can also use the \f(CW\*(C`\eG\*(C'\fR anchor in scalar context. You
	933	still need the \f(CW\*(C`g\*(C'\fR flag.
	934	.PP
	935	.Vb 5
	936	\& $_ = "1122a44";
	937	\& while( m/\eG(\ed\ed)/g )
	938	\& {
	939	\& print "Found $1\en";
	940	\& }
	941	.Ve
	942	.PP
	943	After the match fails at the letter \f(CW\*(C`a\*(C'\fR, perl resets \fIpos()\fR
	944	and the next match on the same string starts at the beginning.
	945	.PP
	946	.Vb 5
	947	\& $_ = "1122a44";
	948	\& while( m/\eG(\ed\ed)/g )
	949	\& {
	950	\& print "Found $1\en";
	951	\& }
	952	.Ve
	953	.PP
	954	.Vb 1
	955	\& print "Found $1 after while" if m/(\ed\ed)/g; # finds "11"
	956	.Ve
	957	.PP
	958	You can disable \fIpos()\fR resets on fail with the \f(CW\*(C`c\*(C'\fR flag.
	959	Subsequent matches start where the last successful match
	960	ended (the value of \fIpos()\fR) even if a match on the same
	961	string has failed in the meantime. In this case, the match
	962	after the \fIwhile()\fR loop starts at the \f(CW\*(C`a\*(C'\fR (where the last
	963	match stopped), and since it does not use any anchor it can
	964	skip over the \f(CW\*(C`a\*(C'\fR to find \*(L"44\*(R".
	965	.PP
	966	.Vb 5
	967	\& $_ = "1122a44";
	968	\& while( m/\eG(\ed\ed)/gc )
	969	\& {
	970	\& print "Found $1\en";
	971	\& }
	972	.Ve
	973	.PP
	974	.Vb 1
	975	\& print "Found $1 after while" if m/(\ed\ed)/g; # finds "44"
	976	.Ve
	977	.PP
	978	Typically you use the \f(CW\*(C`\eG\*(C'\fR anchor with the \f(CW\*(C`c\*(C'\fR flag
	979	when you want to try a different match if one fails,
	980	such as in a tokenizer. Jeffrey Friedl offers this example
	981	which works in 5.004 or later.
	982	.PP
	983	.Vb 9
	984	\& while (<>) {
	985	\& chomp;
	986	\& PARSER: {
	987	\& m/ \eG( \ed+\eb )/gcx && do { print "number: $1\en"; redo; };
	988	\& m/ \eG( \ew+ )/gcx && do { print "word: $1\en"; redo; };
	989	\& m/ \eG( \es+ )/gcx && do { print "space: $1\en"; redo; };
	990	\& m/ \eG( [^\ew\ed]+ )/gcx && do { print "other: $1\en"; redo; };
	991	\& }
	992	\& }
	993	.Ve
	994	.PP
	995	For each line, the \s-1PARSER\s0 loop first tries to match a series
	996	of digits followed by a word boundary. This match has to
	997	start at the place the last match left off (or the beginning
	998	of the string on the first match). Since \f(CW\*(C`m/ \eG( \ed+\eb
	999	)/gcx\*(C'\fR uses the \f(CW\*(C`c\*(C'\fR flag, if the string does not match that
	1000	regular expression, perl does not reset \fIpos()\fR and the next
	1001	match starts at the same position to try a different
	1002	pattern.
	1003	.Sh "Are Perl regexes DFAs or NFAs? Are they \s-1POSIX\s0 compliant?"
	1004	.IX Xref "DFA NFA POSIX"
	1005	.IX Subsection "Are Perl regexes DFAs or NFAs? Are they POSIX compliant?"
	1006	While it's true that Perl's regular expressions resemble the DFAs
	1007	(deterministic finite automata) of the \fIegrep\fR\|(1) program, they are in
	1008	fact implemented as NFAs (non\-deterministic finite automata) to allow
	1009	backtracking and backreferencing. And they aren't POSIX-style either,
	1010	because those guarantee worst-case behavior for all cases. (It seems
	1011	that some people prefer guarantees of consistency, even when what's
	1012	guaranteed is slowness.) See the book \*(L"Mastering Regular Expressions\*(R"
	1013	(from O'Reilly) by Jeffrey Friedl for all the details you could ever
	1014	hope to know on these matters (a full citation appears in
	1015	perlfaq2).
	1016	.Sh "What's wrong with using grep in a void context?"
	1017	.IX Xref "grep"
	1018	.IX Subsection "What's wrong with using grep in a void context?"
	1019	The problem is that grep builds a return list, regardless of the context.
	1020	This means you're making Perl go to the trouble of building a list that
	1021	you then just throw away. If the list is large, you waste both time and space.
	1022	If your intent is to iterate over the list, then use a for loop for this
	1023	purpose.
	1024	.PP
	1025	In perls older than 5.8.1, map suffers from this problem as well.
	1026	But since 5.8.1, this has been fixed, and map is context aware \- in void
	1027	context, no lists are constructed.
	1028	.Sh "How can I match strings with multibyte characters?"
	1029	.IX Xref "regex, and multibyte characters regexp, and multibyte characters regular expression, and multibyte characters"
	1030	.IX Subsection "How can I match strings with multibyte characters?"
	1031	Starting from Perl 5.6 Perl has had some level of multibyte character
	1032	support. Perl 5.8 or later is recommended. Supported multibyte
	1033	character repertoires include Unicode, and legacy encodings
	1034	through the Encode module. See perluniintro, perlunicode,
	1035	and Encode.
	1036	.PP
	1037	If you are stuck with older Perls, you can do Unicode with the
	1038	\&\f(CW\*(C`Unicode::String\*(C'\fR module, and character conversions using the
	1039	\&\f(CW\*(C`Unicode::Map8\*(C'\fR and \f(CW\*(C`Unicode::Map\*(C'\fR modules. If you are using
	1040	Japanese encodings, you might try using the jperl 5.005_03.
	1041	.PP
	1042	Finally, the following set of approaches was offered by Jeffrey
	1043	Friedl, whose article in issue #5 of The Perl Journal talks about
	1044	this very matter.
	1045	.PP
	1046	Let's suppose you have some weird Martian encoding where pairs of
	1047	\&\s-1ASCII\s0 uppercase letters encode single Martian letters (i.e. the two
	1048	bytes \*(L"\s-1CV\s0\*(R" make a single Martian letter, as do the two bytes \*(L"\s-1SG\s0\*(R",
	1049	\&\*(L"\s-1VS\s0\*(R", \*(L"\s-1XX\s0\*(R", etc.). Other bytes represent single characters, just like
	1050	\&\s-1ASCII\s0.
	1051	.PP
	1052	So, the string of Martian \*(L"I am \s-1CVSGXX\s0!\*(R" uses 12 bytes to encode the
	1053	nine characters 'I', ' ', 'a', 'm', ' ', '\s-1CV\s0', '\s-1SG\s0', '\s-1XX\s0', '!'.
	1054	.PP
	1055	Now, say you want to search for the single character \f(CW\*(C`/GX/\*(C'\fR. Perl
	1056	doesn't know about Martian, so it'll find the two bytes \*(L"\s-1GX\s0\*(R" in the \*(L"I
	1057	am \s-1CVSGXX\s0!\*(R" string, even though that character isn't there: it just
	1058	looks like it is because \*(L"\s-1SG\s0\*(R" is next to \*(L"\s-1XX\s0\*(R", but there's no real
	1059	\&\*(L"\s-1GX\s0\*(R". This is a big problem.
	1060	.PP
	1061	Here are a few ways, all painful, to deal with it:
	1062	.PP
	1063	.Vb 3
	1064	\& $martian =~ s/([A-Z][A-Z])/ $1 /g; # Make sure adjacent "martian"
	1065	\& # bytes are no longer adjacent.
	1066	\& print "found GX!\en" if $martian =~ /GX/;
	1067	.Ve
	1068	.PP
	1069	Or like this:
	1070	.PP
	1071	.Vb 6
	1072	\& @chars = $martian =~ m/([A-Z][A-Z]|[^A-Z])/g;
	1073	\& # above is conceptually similar to: @chars = $text =~ m/(.)/g;
	1074	\& #
	1075	\& foreach $char (@chars) {
	1076	\& print "found GX!\en", last if $char eq 'GX';
	1077	\& }
	1078	.Ve
	1079	.PP
	1080	Or like this:
	1081	.PP
	1082	.Vb 3
	1083	\& while ($martian =~ m/\eG([A-Z][A-Z]|.)/gs) { # \eG probably unneeded
	1084	\& print "found GX!\en", last if $1 eq 'GX';
	1085	\& }
	1086	.Ve
	1087	.PP
	1088	Here's another, slightly less painful, way to do it from Benjamin
	1089	Goldberg, who uses a zero-width negative look-behind assertion.
	1090	.PP
	1091	.Vb 5
	1092	\& print "found GX!\en" if $martian =~ m/
	1093	\& (?<![A-Z])
	1094	\& (?:[A-Z][A-Z])*?
	1095	\& GX
	1096	\& /x;
	1097	.Ve
	1098	.PP
	1099	This succeeds if the \*(L"martian\*(R" character \s-1GX\s0 is in the string, and fails
	1100	otherwise. If you don't like using (?<!), a zero-width negative
	1101	look-behind assertion, you can replace (?<![A\-Z]) with (?:^|[^A\-Z]).
	1102	.PP
	1103	It does have the drawback of putting the wrong thing in $\-[0] and $+[0],
	1104	but this usually can be worked around.
	1105	.Sh "How do I match a pattern that is supplied by the user?"
	1106	.IX Subsection "How do I match a pattern that is supplied by the user?"
	1107	Well, if it's really a pattern, then just use
	1108	.PP
	1109	.Vb 2
	1110	\& chomp($pattern = <STDIN>);
	1111	\& if ($line =~ /$pattern/) { }
	1112	.Ve
	1113	.PP
	1114	Alternatively, since you have no guarantee that your user entered
	1115	a valid regular expression, trap the exception this way:
	1116	.PP
	1117	.Vb 1
	1118	\& if (eval { $line =~ /$pattern/ }) { }
	1119	.Ve
	1120	.PP
	1121	If all you really want is to search for a string, not a pattern,
	1122	then you should either use the \fIindex()\fR function, which is made for
	1123	string searching, or, if you can't be disabused of using a pattern
	1124	match on a non\-pattern, then be sure to use \f(CW\*(C`\eQ\*(C'\fR...\f(CW\*(C`\eE\*(C'\fR, documented
	1125	in perlre.
	1126	.PP
	1127	.Vb 1
	1128	\& chomp($pattern = <STDIN>);
	1129	.Ve
	1130	.PP
	1131	.Vb 5
	1132	\& open (FILE, $input) or die "Couldn't open input $input: $!; aborting";
	1133	\& while (<FILE>) {
	1134	\& print if /\eQ$pattern\eE/;
	1135	\& }
	1136	\& close FILE;
	1137	.Ve
	1138	.SH "AUTHOR AND COPYRIGHT"
	1139	.IX Header "AUTHOR AND COPYRIGHT"
	1140	Copyright (c) 1997\-2006 Tom Christiansen, Nathan Torkington, and
	1141	other authors as noted. All rights reserved.
	1142	.PP
	1143	This documentation is free; you can redistribute it and/or modify it
	1144	under the same terms as Perl itself.
	1145	.PP
	1146	Irrespective of its distribution, all code examples in this file
	1147	are hereby placed into the public domain. You are permitted and
	1148	encouraged to use this code in your own programs for fun
	1149	or for profit as you see fit. A simple comment in the code giving
	1150	credit would be courteous but is not required.