In [1]:
import re

Regular Expressions

Basic Regular Expression Patterns

In [2]:
# Negated character class: each match is a run of characters that are neither 'S' nor 's'.
re.findall(r'[^Ss]+', "Some strings.")
Out[2]:
['ome ', 'tring', '.']
In [3]:
# Negated class again: runs of characters that are not a literal period.
re.findall(r'[^\.]+', "work")
Out[3]:
['work']
In [4]:
# A caret that is not the first character in the brackets is literal:
# this class matches runs made of 'e' or '^'.
re.findall(r'[e^]+', "egg, ^")
Out[4]:
['e', '^']
In [5]:
# Character class of 'a', a literal '^' and 'b': matches any run of those
# three characters, which here happens to be the text "a^b".
re.findall(r'[a^b]+', "hello, a^b.")
Out[5]:
['a^b']
In [6]:
# '?' makes the preceding 'u' optional, so both spellings match.
re.findall(r'colou?r', "color, colour")
Out[6]:
['color', 'colour']
In [7]:
# An integer, spelled as one digit followed by zero or more digits.
re.findall(r'[0-9][0-9]*', "2")
Out[7]:
['2']
In [8]:
# The same integer pattern written with '+' (one or more digits).
re.findall(r'[0-9]+', "2")
Out[8]:
['2']
In [9]:
# '.' is a wildcard that stands for any single character between "beg" and "n".
re.findall(r'beg.n', "begin, beg'n, begun")
Out[9]:
['begin', "beg'n", 'begun']
In [10]:
# Anchors: '$' ties the match to the end of the string.
# The escaped '\.' is a literal period, not the wildcard.
re.findall(r'dog\.$', "the dog.")
Out[10]:
['dog.']
In [11]:
# '\b' asserts a word boundary, so the "the" inside "other" and at the
# start of "they" is rejected; only the standalone word matches.
re.findall(r'\bthe\b', "other, the, $they")
Out[11]:
['the']

Disjunction, Grouping, and Precedence

In [12]:
# Disjunction: '|' matches either alternative.
re.findall(r'cat|dog', "there are a cat and a dog.")
Out[12]:
['cat', 'dog']
In [13]:
# Precedence: the parentheses restrict the disjunction to the suffix, so the
# pattern matches "guppy" or "guppies". Because the pattern contains one
# capturing group, findall() returns that group's text (the suffix) rather
# than the whole match.
re.findall(r'gupp(y|ies)', "guppy and guppies.")
Out[13]:
['y', 'ies']
In [14]:
# One column label: "Column", a space, one or more digits, then any trailing spaces.
re.findall(r'Column [0-9]+ *', "Column 1 Column 2 Column 3 Column 4.")
Out[14]:
['Column 1 ', 'Column 2 ', 'Column 3 ', 'Column 4']
In [15]:
# The '*' applies to the parenthesised group as a whole, so a single match
# spans every repeated column label. findall() reports only the group's last
# repetition, followed by empty strings for the zero-length matches at the
# end of the text.
re.findall(r'(Column [0-9]+ *)*', "Column 1 Column 2 Column 3 Column 4.")
Out[15]:
['Column 4', '', '']
In [16]:
# Sequences bind more tightly than disjunction: the pattern means "the" or
# "any", so it can never match "theny" as a whole.
re.findall(r'the|any', "the, any, theny")
Out[16]:
['the', 'any', 'the']

A Simple Example

In [17]:
# Successive attempts at matching the word "the".
# 1. The bare literal misses the capitalised "The".
print(re.findall(r'the', "the, The, the_, the25"))

# 2-5. Refinements, all run against the same sample text:
#   [tT]he                             accepts either case, but also hits "their"
#   \b[tT]he\b                         word boundaries; '_' and digits count as word characters
#   [^a-zA-Z][tT]he[^a-zA-Z]           needs a non-letter on both sides, so "the" at the start is lost
#   (^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)   also accepts the string edges; findall() returns both groups as tuples
sample = "the, their, the_, the25"
for attempt in (
    r'[tT]he',
    r'\b[tT]he\b',
    r'[^a-zA-Z][tT]he[^a-zA-Z]',
    r'(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)',
):
    print(re.findall(attempt, sample))
['the', 'the', 'the']
['the', 'the', 'the', 'the']
['the']
[' the_', ' the2']
[('', ','), (' ', '_'), (' ', '2')]

A More Complex Example

In [18]:
# Price pattern: a dollar sign, up to three digits, an optional fractional
# part, then a word boundary. It still matches the bare '$' in front of a
# longer number and the "$199.1" glued to the letter 'a'.
p = re.compile(r'\$[0-9]{0,3}(\.[0-9]+)?\b')
prices = "$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"
for found in p.finditer(prices):
    print(found)
<_sre.SRE_Match object; span=(0, 6), match='$199.9'>
<_sre.SRE_Match object; span=(15, 22), match='$199.99'>
<_sre.SRE_Match object; span=(41, 45), match='$199'>
<_sre.SRE_Match object; span=(59, 60), match='$'>
<_sre.SRE_Match object; span=(74, 80), match='$199.1'>
In [19]:
# Same price pattern, now required to start at the beginning of the text or
# right after a non-word character; the leading character becomes part of
# the match, and the price glued to the letter 'a' is no longer found.
p = re.compile(r'(^|\W)\$[0-9]{0,3}(\.[0-9]+)?\b')
prices = "$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"
for found in p.finditer(prices):
    print(found)
<_sre.SRE_Match object; span=(0, 6), match='$199.9'>
<_sre.SRE_Match object; span=(14, 22), match=' $199.99'>
<_sre.SRE_Match object; span=(40, 45), match=' $199'>
<_sre.SRE_Match object; span=(58, 60), match=' $'>

Regular Expression Substitution, Capture Groups, and ELIZA

In [20]:
# Backreference: \1 must repeat exactly what the first group captured, so
# only the sentence that uses the same adjective twice is matched.
for found in re.finditer(
        r'the (.*)er they were, the \1er they will be',
        "the bigger they were, the bigger they will be but not the bigger they were, the faster they will be."):
    print(found)
<_sre.SRE_Match object; span=(0, 45), match='the bigger they were, the bigger they will be'>
In [21]:
# Two capture groups with two backreferences: \1 repeats the adjective stem
# and \2 repeats the verb, so the clause ending in "we ate" is not matched.
for found in re.finditer(
        r'the (.*)er they (.*), the \1er we \2',
        "the faster they ran, the faster we ran but not the faster they ran, the faster we ate."):
    print(found)
<_sre.SRE_Match object; span=(0, 38), match='the faster they ran, the faster we ran'>
In [22]:
# (?:...) groups without capturing, so \1 refers to (people|cats), the only
# capturing group; the same noun must therefore reappear after "like some".
for found in re.finditer(
        r'(?:some|a few) (people|cats) like some \1',
        "a few cats like some cats but not some cats like some a few."):
    print(found)
<_sre.SRE_Match object; span=(0, 25), match='a few cats like some cats'>
In [23]:
# group() with no argument returns the whole match, so it makes no difference
# here whether the repeated class sits in a capturing or a non-capturing group.
for grouped in ("([abc])+", "(?:[abc])+"):
    print(re.match(grouped, "abc").group())
abc
abc
In [24]:
# groups() lists capturing groups only: a repeated capturing group keeps its
# last repetition, while a non-capturing group contributes nothing.
for grouped in ("([abc])+", "(?:[abc])+"):
    print(re.match(grouped, "abc").groups())
('c',)
()

Lookahead and Lookbehind Assertions

In [25]:
# Lookahead: (?=...) succeeds only if the text ahead matches, (?!...) only if
# it does not; neither form consumes any characters.
for lookahead in (r'^(?=Volcano)[a-zA-Z]+', r'^(?!Volcano)[a-zA-Z]+'):
    test = re.compile(lookahead)
    print(test.findall("Volcano I"))
['Volcano']
[]
In [26]:
# Lookbehind: (?<=...) requires the text just before the current position to
# match, (?<!...) requires that it does not. Directly after '^' nothing
# precedes the position, so the positive form finds nothing and the negative
# form always passes.
for lookbehind in (r'^(?<=Volcano)[a-zA-Z]+', r'^(?<!Volcano)[a-zA-Z]+'):
    test = re.compile(lookbehind)
    print(test.findall("Volcano I"))
[]
['Volcano']
In [ ]: