In [1]:

import re

Regular Expressions¶

Basic Regular Expression Patterns¶

In [2]:

# Negated character class: runs of characters that are neither "S" nor "s".
re.findall(r'[^Ss]+', "Some strings.")

Out[2]:

['ome ', 'tring', '.']

In [3]:

# Negated class: runs of characters that are not a period.
# (Inside [...] a "." is already literal, so the backslash is optional.)
re.findall(r'[^\.]+', "work")

Out[3]:

['work']

In [4]:

# A caret that is not the first character of a class is literal,
# so this matches runs made of "e" or "^".
re.findall(r'[e^]+', "egg, ^")

Out[4]:

['e', '^']

In [5]:

# The literal three-character sequence "a^b".
# The caret is escaped so it is read as a plain character rather than an anchor.
# A character class such as [a^b]+ would instead match any run of "a", "^" or
# "b" (for example "abba"), which is not what this example is about.
re.compile(r'a\^b').findall("hello, a^b.")

Out[5]:

['a^b']

In [6]:

# "?" makes the preceding "u" optional, so both spellings match.
re.findall(r'colou?r', "color, colour")

Out[6]:

['color', 'colour']

In [7]:

# An integer spelled out as one digit followed by zero or more digits.
re.findall(r'[0-9][0-9]*', "2")

Out[7]:

['2']

In [8]:

# An integer again, this time with "+" (one or more digits).
re.findall(r'[0-9]+', "2")

Out[8]:

['2']

In [9]:

# "." is a wildcard standing for exactly one character.
re.findall(r'beg.n', "begin, beg'n, begun")

Out[9]:

['begin', "beg'n", 'begun']

In [10]:

# Anchors: "$" ties the match to the end of the string.
# "\." is a literal period, not the wildcard.
re.findall(r'dog\.$', "the dog.")

Out[10]:

['dog.']

In [11]:

# "\b" is a word boundary: "the" is matched only as a standalone word,
# not inside "other" or at the start of "they".
re.findall(r'\bthe\b', "other, the, $they")

Out[11]:

['the']

Disjunction, Grouping, and Precedence¶

In [12]:

# Disjunction: "|" matches either alternative.
re.findall(r'cat|dog', "there are a cat and a dog.")

Out[12]:

['cat', 'dog']

In [13]:

# Precedence: the parentheses limit the disjunction to the suffix,
# so the pattern means "guppy" or "guppies".
# The pattern contains one capturing group, so findall returns that group's
# text ("y" / "ies") rather than the whole match.
re.findall(r'gupp(y|ies)', "guppy and guppies.")

Out[13]:

['y', 'ies']

In [14]:

# Without parentheses, "*" applies only to the preceding space,
# so every "Column N" is reported as its own match.
re.findall(r'Column [0-9]+ *', "Column 1 Column 2 Column 3 Column 4.")

Out[14]:

['Column 1 ', 'Column 2 ', 'Column 3 ', 'Column 4']

In [15]:

# Parentheses make "*" repeat the whole "Column N " unit, so a single match
# spans all four columns.
# findall returns the group's last repetition ("Column 4"); the empty strings
# are zero-length matches found at the positions that remain after it.
re.findall(r'(Column [0-9]+ *)*', "Column 1 Column 2 Column 3 Column 4.")

Out[15]:

['Column 4', '', '']

In [16]:

# Sequences bind tighter than disjunction: the pattern means "the" or "any",
# not "th" + ("e" or "a") + "ny", so it cannot match "theny" as a whole.
re.findall(r'the|any', "the, any, theny")

Out[16]:

['the', 'any', 'the']

A Simple Example¶

In [17]:

# Match the word "the", refining the pattern step by step.
# 1. Plain "the" misses the capitalised "The".
print(re.findall(r'the', "the, The, the_, the25"))
# 2. [tT]he accepts either case, but also fires inside longer words such as "their".
print(re.findall(r'[tT]he', "the, their, the_, the25"))
# 3. Word boundaries reject "their"; "_" and digits are word characters,
#    so "the_" and "the25" are rejected as well.
print(re.findall(r'\b[tT]he\b', "the, their, the_, the25"))
# 4. Requiring a non-letter on both sides accepts "the_" and "the2",
#    but loses the "the" at the very start of the string.
print(re.findall(r'[^a-zA-Z][tT]he[^a-zA-Z]', "the, their, the_, the25"))
# 5. (^|...) and (...|$) also accept the string edges; with two groups,
#    findall returns one (before, after) tuple per match.
print(re.findall(r'(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)', "the, their, the_, the25"))

['the', 'the', 'the']
['the', 'the', 'the', 'the']
['the']
[' the_', ' the2']
[('', ','), (' ', '_'), (' ', '2')]

A More Complex Example¶

In [18]:

# Dollar amounts: "$", up to three digits, an optional fractional part,
# then a word boundary.
# The lone "$" in the output comes from "$1999999.99": {0,3} may match zero
# digits, and a word boundary exists between "$" and "1".
p = re.compile(r'\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in re.finditer(
        p, "$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)

<_sre.SRE_Match object; span=(0, 6), match='$199.9'>
<_sre.SRE_Match object; span=(15, 22), match='$199.99'>
<_sre.SRE_Match object; span=(41, 45), match='$199'>
<_sre.SRE_Match object; span=(59, 60), match='$'>
<_sre.SRE_Match object; span=(74, 80), match='$199.1'>

In [19]:

# Same amount pattern, but the "$" must now sit at the start of the string or
# right after a non-word character, so the "$199.1" glued to "a" is dropped.
# That preceding character is consumed, which is why the matches begin with a space.
p = re.compile(r'(^|\W)\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in re.finditer(
        p, "$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)

<_sre.SRE_Match object; span=(0, 6), match='$199.9'>
<_sre.SRE_Match object; span=(14, 22), match=' $199.99'>
<_sre.SRE_Match object; span=(40, 45), match=' $199'>
<_sre.SRE_Match object; span=(58, 60), match=' $'>

Regular Expression Substitution, Capture Groups, and ELIZA¶

In [20]:

# Back-reference: \1 must repeat exactly what group 1 captured, so the
# "bigger ... faster" sentence is rejected.
for item in re.finditer(
        r'the (.*)er they were, the \1er they will be',
        "the bigger they were, the bigger they will be but not the bigger they were, the faster they will be."):
    print(item)

<_sre.SRE_Match object; span=(0, 45), match='the bigger they were, the bigger they will be'>

In [21]:

# Two back-references: \1 and \2 must repeat groups 1 and 2 respectively,
# so "ran ... ate" does not match.
for item in re.finditer(
        r'the (.*)er they (.*), the \1er we \2',
        "the faster they ran, the faster we ran but not the faster they ran, the faster we ate."):
    print(item)

<_sre.SRE_Match object; span=(0, 38), match='the faster they ran, the faster we ran'>

In [22]:

# (?:...) groups without capturing, so \1 refers to (people|cats),
# the only capturing group in the pattern.
for item in re.finditer(
        r'(?:some|a few) (people|cats) like some \1',
        "a few cats like some cats but not some cats like some a few."):
    print(item)

<_sre.SRE_Match object; span=(0, 25), match='a few cats like some cats'>

In [23]:

# group() with no argument is the whole match, so the result is "abc"
# whether or not the parentheses capture.
print(*(re.match(regex, "abc").group() for regex in ("([abc])+", "(?:[abc])+")), sep="\n")

abc
abc

In [24]:

# groups() lists the capturing groups only: a repeated group keeps its last
# repetition ("c"), and a non-capturing group contributes nothing.
print(*(re.match(regex, "abc").groups() for regex in ("([abc])+", "(?:[abc])+")), sep="\n")

('c',)
()

Lookahead and lookbehind assertions¶

In [25]:

# Lookahead: (?=...) succeeds only when the text ahead matches, (?!...) only
# when it does not; neither consumes any characters.
for test in (re.compile(r'^(?=Volcano)[a-zA-Z]+'), re.compile(r'^(?!Volcano)[a-zA-Z]+')):
    print(test.findall("Volcano I"))

['Volcano']
[]

In [26]:

# Lookbehind: (?<=...) succeeds only when the text just before the current
# position matches, (?<!...) only when it does not.
# Both patterns are anchored with "^", and nothing precedes the start of the
# string, so here the positive form can never succeed and the negative form
# always does.
for test in (re.compile(r'^(?<=Volcano)[a-zA-Z]+'), re.compile(r'^(?<!Volcano)[a-zA-Z]+')):
    print(test.findall("Volcano I"))

[]
['Volcano']

In [ ]: