#!/usr/local/bin/perl
# Perl "Pattern Matching" Frequently Asked Questions
#
# This is NOT an exhaustive treatment of pattern
# matching in Perl. This is a collection of fairly
# simple syntax examples.
# Perl's regex engine is quite powerful.
# For a complete treatment of pattern matching with Perl
# see 'Programming Perl' by Larry Wall, Tom Christiansen,
# and Randal Schwartz and 'Mastering Regular Expressions'
# by Jeffrey E. F. Friedl.....both published by O'Reilly.
#
# You should be able to copy/paste this entire faq into
# a file and run it. There is one example that is
# commented out completely because it would try to open a
# non-existent file.
#
# Now, from the simple to the more complex.....
################ THE /match/ SYNTAX ####################
# Walk through the basic match operator: plain matches, the 'i'
# switch, character classes and quantifiers. After every successful
# match, $& holds the text that was matched.
$str = 'a string of words with no punctuation';
# Perl's default match punctuation is /pattern/. You
# can use other delimiters, but this is a FAQ, not a book.
# The match operator can be used against $_, Perl's
# 'current' var.
$_ = $str;
if (/string/) { print "matched 'string'\n"; }
# In order to be clear, we will be verbose and explicitly
# operate on a scalar variable ($str) using the '=~' operator.
if ($str =~ /string/) { print "\nmatched 'string' again.\n"; }
# make the match case-insensitive with a switch, 'i'.
if ($str =~ /strinG/i)
{ print "\nmatched strinG to string, ignoring case\n"; }
# use character classes, like digits or white space
if ($str =~ /\d/) { print "\nmatched a digit, $&\n"; }
else { print "\nThere are no digits in the string.\n"; }
# put digits in the string.
$str = 'string of words and digits 5678 and no punctuation';
# check to see if the str contains a digit
if ($str =~ /\d/) { print "\nmatched one digit, $&\n"; }
# find the first 2 digits in the string
if ($str =~ /\d\d/) { print "\nmatched two digits, $&\n"; }
# that only matched the '56'. Get the others by using the
# '+' to match one or more times.
if ($str =~ /\d+/) { print "\nMatched multiple digits - $&\n"; }
# match 2 or 3 digits with {2,3} - note the match is greedy.
# If it can find 3 it will match 3, not 2.
if ($str =~ /\d{2,3}/) { print "\nMatched 2 or 3 digits - $&\n"; }
# look for white space chars (space, tab, etc...)
if ($str =~ /\s/) { print "\nmatched white space, -$&-\n"; }
# look for a digit followed by a white space char
if ($str =~ /\d\s/) { print "\nmatched digit space, -$&-\n"; }
# look for an alpha char then space then a digit.
if ($str =~ /[a-z]\s\d/)
{ print "\nmatched alpha space digit, -$&-\n"; }
# a practical example - look for things that look like IP addresses.
# parse an access log for IP addresses
# Sample access-log record holding a date and a remote host.
$str = '<date>10/16/2000</date><remoteHost>123.234.9.78</remoteHost>';
# \d{1,3} asks for at least 1 and at most 3 digits. Each dot is
# escaped (\.) so that it matches a literal dot instead of acting
# as the wild card.
if ($str =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/) {
    # Copy the match variables right away:
    #   $` - the part of the string before the match
    #   $& - the text of the last match
    #   $' - the part of the string after the match
    my $before = $`;
    $remoteHost = $&;
    my $after = $';
    print "\nFound an IP - $remoteHost\n";
    print "part before the IP - $before\n";
    print "part after the IP - $after\n";
}
# or parse the tags and catch what sits between them
if ($str =~ /(<remoteHost>)(.*?)(<\/remoteHost>)/) {
    # Each pair of parentheses captures into $1, $2, $3 in order,
    # so (.*?) lands in $2. Every new successful match resets
    # these variables, hence the immediate copy.
    my ($open_tag, $host, $close_tag) = ($1, $2, $3);
    $remoteHost = $host;
    print "\nFound an IP - $remoteHost in between $open_tag and $close_tag\n";
}
# You might want to run through a web log looking for
# all the occurrences of the <remoteHost>...</remoteHost>
# chunk. You can read the entire file into a var, and
# catch all of the IP's like this.
# open(IPF,"<your_web_log.txt") or die("Failed to open log, $!.\n");
# while (<IPF>) { $web_log .= $_; }
# close IPF;
# while ($web_log =~ /<remoteHost>(.*?)<\/remoteHost>/g)
# { print "Host: $1\n"; }
#
# 'g' at the end of the match makes the match 'global'.
# If the 'g' were not there, the match would happen once
# and stop. With the 'g', the match will happen as many
# times as the pattern occurs in the string in $web_log.
# You can also use .....
# . is a wild card - matches any single char except a new line
# ? after a quantifier (as in .*?) says match minimally - not greedily
# \d - a digit
# \n - new line
# \r - carriage return
# \t - tab
# [A-Z] - upper case A through Z.
# [a-z] - lower case
# [^a-z] - the '^' makes it a negative class or anything other than a-z
# \s - white space
# \S - non-white space char
# There are others.....see the books.
################ THE 's/find/replace/' SYNTAX ####
# Demonstrate s/find/replace/ with the 'i' (ignore case),
# 'g' (global) and 'e' (evaluate the right side) switches.
print "\n\n";
print ' ######## PATTERN REPLACEMENT ##########';
$str = 'a string of words with no punctuation and the word stRiNg repeated';
# replacement syntax: $str =~ s/replaceThis/withThis/;
# replace the first 'string' with 'STRING'
print "\nbefore - $str\n";
$str =~ s/string/STRING/;
print "after - $str\n";
# replace all occurrences of 'string' with 'STRING' ignoring case
$str = 'a string of words with no punctuation and the word stRiNg repeated';
print "\nBefore global => $str\n";
$str =~ s/string/STRING/ig;
# the 'g' = global
print "After global =>$str\n\n";
# using an eval in the right side: with the 'e' switch the
# replacement is run as Perl code, so every match ($&) gets the
# next value of the counter $i appended.
$str =~ s/string/$&.++$i/gei;
print "NUMBERED =>$str\n\n";
# or, evaluate a call to a sub routine in the right side.
# number_them is called with parentheses and without the
# old-style '&' prefix.
$str =~ s/string\d/number_them($&)/gei;
# Append a running sequence number to the text passed in.
#   argument: the text to be numbered
#   returns:  the text followed by the current counter value
# The counter lives in the package variable $number and is bumped
# on every call, so successive calls yield 1, 2, 3, ...
sub number_them
{
    my ($text) = @_;
    return $text . ++$number;
}
# Show the string as it stands after the second numbering pass.
print "NUMBERED AGAIN=>" . $str . "\n\n";
# There are also negative and positive look-aheads, that
# may be addressed in the future. Again, if you want to
# know how to use them, see one of the books.
# I hope this helps.