Our text: The Zen of Python

In [ ]:
# Show the contents of zen.txt, the file that the Perl cells below read.
!cat zen.txt
In [ ]:
%%script perl
# Print every line of zen.txt that contains the text "never".
# Three-argument open on a lexical handle; die so that a missing file is
# reported instead of silently producing no output.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print if /never/;    # both the match and print default to $_ (current line)
}
close($zen);
In [ ]:
%%script perl
# Print every line of zen.txt that contains the phrase "is better than".
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print if /is better than/;    # spaces in a pattern match literally
}
close($zen);

Case-sensitive by default

In [ ]:
%%script perl
# Print lines containing lowercase "simple". Matching is case-sensitive by
# default, so a capitalized "Simple" does not count.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print if /simple/;
}
close($zen);
In [ ]:
%%script perl
# Print lines containing "simple" in any letter case; the trailing /i flag
# makes the match case-insensitive.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print if /simple/i;
}
close($zen);

Wildcards

In [ ]:
%%script perl
# Print lines that contain a space, any three characters, and another space.
# Each "." is a wildcard matching one character (other than a newline).
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print if / ... /;
}
close($zen);

Extracting

In [ ]:
%%script perl
# For each line with a space-delimited run of three characters, print just
# that run. The parentheses capture the three characters into $1.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print "$1\n" if / (...) /;    # only the first such run on each line
}
close($zen);
In [ ]:
%%script perl
# For lines containing " to ", echo the line and then print the four
# characters before it ($1) and the five characters after it ($2).
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/(....) to (.....)/) {
        print;                # the whole matching line
        print "$1, $2\n";     # the two captured groups
    }
}
close($zen);

* means "any number" (zero or more) of the preceding item

In [ ]:
%%script perl
# For lines containing " to ", echo the line and then print everything before
# it ($1) and everything after it ($2). ".*" matches any number of characters
# and is greedy, so $1 extends up to the last " to " on the line.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/(.*) to (.*)/) {
        print;
        print "$1, $2\n";
    }
}
close($zen);

Character classes

In [ ]:
%%script perl
# Print space-delimited six-character runs that start with a vowel.
# [aeiou] is a character class matching one of the listed letters; /i makes it
# accept uppercase vowels as well.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print "$1\n" if / ([aeiou].....) /i;
}
close($zen);
In [ ]:
%%script perl
# Print space-delimited words of any length that start with a vowel.
# [^ ] is a negated class (any character except a space); the * repeats it.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    print "$1\n" if / ([aeiou][^ ]*) /i;
}
close($zen);
In [ ]:
%%script perl
# For lines containing " to ", echo the line and then print the run of
# non-space characters directly before it ($1) and directly after it ($2).
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/([^ ]*) to ([^ ]*)/) {
        print;
        print "$1, $2\n";    # [^ ] also admits punctuation, so it is captured too
    }
}
close($zen);

Hmm. How do we get rid of those trailing punctuation marks?

character class shortcuts

In [ ]:
%%script perl
# For lines containing " to ", echo the line and then print the runs of
# lowercase letters on either side. The range class [a-z] excludes
# punctuation, unlike [^ ].
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/([a-z]*) to ([a-z]*)/) {
        print;
        print "$1, $2\n";
    }
}
close($zen);
In [ ]:
%%script perl
# Same extraction around " to ", using the \w shortcut (letters, digits and
# underscore) instead of spelling out a range.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/([\w]*) to ([\w]*)/) {
        print;
        print "$1, $2\n";
    }
}
close($zen);

Search and replace

In [ ]:
%%script perl
# Print every line of zen.txt with its first "is" replaced by "might be".
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    s/is/might be/;    # acts on $_; without /g only the first occurrence changes
    print;
}
close($zen);
In [ ]:
%%script perl
# Rewrite "<word> to" as "<word>, really very <word>, to" on every line and
# print the result. Lines without a match are printed unchanged.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    # In the replacement text the captured word is $1; the sed-style \1 is
    # only tolerated there and draws a warning when warnings are enabled.
    s/ (\w*) to / $1, really very $1, to /;
    print;
}
close($zen);

splitting

In [ ]:
%%script perl
# For lines containing " to ", echo the line and then print the fields
# obtained by splitting it on single spaces.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/ to /) {
        # The semicolon here was missing, which fused the two statements into
        # a single nested "print print split / /;" call.
        print;
        # split defaults to $_; the fields are printed back-to-back because
        # no output field separator ($,) is set.
        print split / /;
    }
}
close($zen);
In [ ]:
%%script perl
# Look for a digits-hyphen-digits run (such as a phone number) in a sample
# sentence and print the first one found.
# NOTE(review): the original cell stopped mid-statement (no ';' after the
# assignment and an unclosed 'if ('); printing the match is an assumed intent.
my $data = "I can be reached at (937) 395-2343 or 122-4235, unless it's raining.";
if ($data =~ /\d+-\d+/) {
    print "$&\n";    # $& holds the text matched by the last successful match
}

Verbose mode

This flag allows you to write regular expressions that look nicer. Whitespace within the pattern is ignored, except when in a character class or preceded by an unescaped backslash, and, when a line contains a '#' neither in a character class nor preceded by an unescaped backslash, all characters from the leftmost such '#' through the end of the line are ignored.

In [ ]:
%%script perl
# Print the first capitalized word on each line of zen.txt: one uppercase
# letter followed by one or more lowercase letters, bounded by \b on both sides.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/(\b[A-Z][a-z]+\b)/) {
        print "$1\n";
    }
}
close($zen);

Random question: How to display all the capitalized words?

But anyway:

In [ ]:
%%script perl
# Same capitalized-word search, written with the /x flag so the pattern can
# be spread over several lines and annotated with comments.
# The open is checked so a missing file fails loudly rather than printing nothing.
open(my $zen, '<', 'zen.txt') or die "Cannot open zen.txt: $!";
while (<$zen>) {
    if (/ # Capitalized word detector
          (       # capture the result in $1
           \b     # word boundary
           [A-Z]  # one capital letter
           [a-z]+ # one or more lowercase letters
           \b     # word boundary
           )      # end captured group
          /x) {
        print "$1\n";
    }
}
close($zen);

What are we skipping?

Lots:

  • Lookaheads

Verbal expressions

see wiki

In [ ]:
# VerEx is the builder class used by the verbal-expression cells below.
from verbalexpressions import VerEx
# Start with an empty builder (the next cell creates a fresh one before use).
verbal_expression = VerEx()
In [ ]:
# Build a pattern meant to recognise well-formed URLs: "http", an optional
# "s", "://", an optional "www.", then anything that is not a space, spanning
# the whole line.
verbal_expression = VerEx()
tester = (
    verbal_expression
    .start_of_line()
    .find('http')
    .maybe('s')
    .find('://')
    .maybe('www.')
    .anything_but(' ')
    .end_of_line()
)

# Display the regular-expression source the builder produced.
tester.source()
In [ ]:
# Try the URL pattern on a well-formed https URL.
tester.match("https://www.google.com")
In [ ]:
# Try the same pattern on text that is not a URL (presumably no match — confirm).
tester.match("my nifty website")
In [ ]:
# Keep the match result so that it can be inspected afterwards.
result = tester.match("https://www.google.com")
In [ ]:
# Inspect the groups captured by the stored match result.
result.groups()
In [ ]:
# Show the regex source produced by a builder that contains only anything().
VerEx().anything().source()
In [ ]:
# NOTE(review): the pattern is lowercase 'cows' while the text has 'Cows' in
# mid-sentence; if match() anchors at the start and is case-sensitive this
# presumably finds nothing — confirm whether that is the intended demonstration.
VerEx().find('cows').match('how do you like Cows?')
In [ ]:
# Pattern for "ftp" or "http", then an optional "s", then "://".
# `or` is a reserved word in Python, so `.or()` is a SyntaxError; the builder
# exposes the alternation operator as `OR()` instead.
ve2 = VerEx().find("ftp").OR().find("http").maybe("s").then("://")

Tragically, RegExpBuilder hasn't been set up for pip install yet. I cloned it from git and then created a symlink:

ln -s ~/sw/RegExpBuilder/python/regexpbuilder regexpbuilder
In [ ]:
from regexpbuilder.RegExpBuilder import RegExpBuilder
In [ ]:
# Build a pattern for a dollar amount. As the method names suggest: a "$",
# at least one digit, a ".", then two single digits (semantics per the
# RegExpBuilder library — TODO confirm).
builder1 = (RegExpBuilder()
  .find("$")
  .min(1).digits()
  .then(".")
  .digit()
  .digit())
In [ ]:
# NOTE(review): this calls search() on the builder itself, whereas the next
# cell first obtains a pattern via get_regexp() — confirm the builder exposes
# search() directly.
builder1.search("is there money in $2.22?")
In [ ]:
# Get the pattern object from the builder (presumably a compiled re pattern —
# confirm) and search the sample sentence with it.
money = builder1.get_regexp()
result = money.search("is there money in $2.22?")
# Display the search result.
result
In [ ]:
# Show the matched text (group 0), assuming result is an re match object.
result.group(0)
In [ ]:
# Compose a larger pattern from the money pattern: "was" <money>, an optional
# ",", then "now" <money>.
# NOTE(review): money is the object returned by get_regexp(), not a string
# like the other then() arguments in this notebook — confirm then() accepts it.
discount = (RegExpBuilder()
            .find("was").then(money)
            .maybe(",").find("now").then(money)
            )
In [ ]:
# Inspect the object returned by builder1.get_regexp().
money
In [ ]:
# Check what type of object get_regexp() returned.
type(money)