In [1]:
import re


## Regular Expressions¶

### Basic Regular Expression Patterns¶

In [2]:
# [^Ss] is a negated class: collect every run of characters that are neither "S" nor "s"
re.findall(r'[^Ss]+', "Some strings.")

Out[2]:
['ome ', 'tring', '.']
In [3]:
# runs of anything except a period (the backslash is optional inside a character class)
re.findall(r'[^\.]+', "work")

Out[3]:
['work']
In [4]:
# a caret that is not the first character of a class is a literal: match "e" or "^"
re.findall(r'[e^]+', "egg, ^")

Out[4]:
['e', '^']
In [5]:
# the class holds the three literals "a", "^" and "b"; the run "a^b" is built from them
re.findall(r'[a^b]+', "hello, a^b.")

Out[5]:
['a^b']
In [6]:
# "?" makes the preceding "u" optional, so both spellings match
re.findall(r'colou?r', "color, colour")

Out[6]:
['color', 'colour']
In [7]:
# an integer, spelled out as one digit followed by zero or more digits
re.findall(r'[0-9][0-9]*', "2")

Out[7]:
['2']
In [8]:
# the same integer pattern, written with "+" (one or more digits)
re.findall(r'[0-9]+', "2")

Out[8]:
['2']
In [9]:
# "." stands for any single character between "beg" and "n"
re.findall(r'beg.n', "begin, beg'n, begun")

Out[9]:
['begin', "beg'n", 'begun']
In [10]:
# begin and end
# \. means . is a literal period, not the wildcard
re.compile(r'dog\.$').findall("the dog.")

Out[10]:
['dog.']
In [11]:
# boundary
re.compile(r'\bthe\b').findall("other, the, $they")

Out[11]:
['the']

### Disjunction, Grouping, and Precedence¶

In [12]:
# "|" is disjunction: match either alternative
re.findall(r'cat|dog', "there are a cat and a dog.")

Out[12]:
['cat', 'dog']
In [13]:
# parentheses limit the disjunction to the suffix; because the pattern has one
# capturing group, findall returns only what that group captured
re.findall(r'gupp(y|ies)', "guppy and guppies.")

Out[13]:
['y', 'ies']
In [14]:
# without parentheses, "*" applies only to the space before it, so each column label matches on its own
re.findall(r'Column [0-9]+ *', "Column 1 Column 2 Column 3 Column 4.")

Out[14]:
['Column 1 ', 'Column 2 ', 'Column 3 ', 'Column 4']
In [15]:
# parentheses make "*" repeat the whole label; findall reports the group's last
# repetition, followed by the empty matches the starred pattern allows at "." and at the end
re.findall(r'(Column [0-9]+ *)*', "Column 1 Column 2 Column 3 Column 4.")

Out[15]:
['Column 4', '', '']
In [16]:
# sequences bind tighter than disjunction: the pattern means "the" or "any",
# never "theny" (only the "the" inside "theny" is found)
re.findall(r'the|any', "the, any, theny")

Out[16]:
['the', 'any', 'the']

### A Simple Example¶

In [17]:
# match the word "the": each pattern below is a stricter attempt than the last
# literal, case-sensitive: skips "The" but also matches inside "the_" and "the25"
print(re.compile(r'the').findall("the, The, the_, the25"))
# accept either case: still matches the prefix of "their"
print(re.compile(r'[tT]he').findall("the, their, the_, the25"))
# word boundaries: "_" and digits are word characters, so only the bare "the" is left
print(re.compile(r'\b[tT]he\b').findall("the, their, the_, the25"))
# require a non-letter on both sides: misses "the" at the very start of the string
print(re.compile(r'[^a-zA-Z][tT]he[^a-zA-Z]').findall("the, their, the_, the25"))
print(re.compile(r'(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)').findall("the, their, the_, the25"))

['the', 'the', 'the']
['the', 'the', 'the', 'the']
['the']
[' the_', ' the2']
[('', ','), (' ', '_'), (' ', '2')]

### A More Complex Example¶

In [18]:
p = re.compile(r'\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in p.finditer("$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)

<_sre.SRE_Match object; span=(0, 6), match='$199.9'>
<_sre.SRE_Match object; span=(15, 22), match='$199.99'>
<_sre.SRE_Match object; span=(41, 45), match='$199'>
<_sre.SRE_Match object; span=(59, 60), match='$'>
<_sre.SRE_Match object; span=(74, 80), match='$199.1'>

In [19]:
# Require start-of-string or a non-word character before "$", so a price glued to
# a word (the trailing "a$199.1") is rejected. That leading character is part of
# the match, which is why most matches begin with a space.
p = re.compile(r'(^|\W)\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in p.finditer("$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)

<_sre.SRE_Match object; span=(0, 6), match='$199.9'>
<_sre.SRE_Match object; span=(14, 22), match=' $199.99'>
<_sre.SRE_Match object; span=(40, 45), match=' $199'>
<_sre.SRE_Match object; span=(58, 60), match=' $'>


### Regular Expression Substitution, Capture Groups, and ELIZA¶

In [20]:
# \1 is a backreference: it must repeat exactly what group 1 captured, so
# "bigger ... bigger" matches while "bigger ... faster" does not
for item in re.compile(r'the (.*)er they were, the \1er they will be').finditer(
        "the bigger they were, the bigger they will be but not the bigger they were, the faster they will be."):
    print(item)

<_sre.SRE_Match object; span=(0, 45), match='the bigger they were, the bigger they will be'>

In [21]:
# two capture groups, each re-used by number: \1 repeats the adjective stem and
# \2 repeats the verb, so "they ran ... we ate" is rejected
for item in re.compile(r'the (.*)er they (.*), the \1er we \2').finditer(
        "the faster they ran, the faster we ran but not the faster they ran, the faster we ate."):
    print(item)

<_sre.SRE_Match object; span=(0, 38), match='the faster they ran, the faster we ran'>

In [22]:
# non-capturing: (?:some|a few) groups without taking a number, so \1 refers
# to (people|cats) and the phrase must end with the same noun
for item in re.compile(r'(?:some|a few) (people|cats) like some \1').finditer(
        "a few cats like some cats but not some cats like some a few."):
    print(item)

<_sre.SRE_Match object; span=(0, 25), match='a few cats like some cats'>

In [23]:
# capturing or not, the overall match (group 0) is the same text
print(re.compile("([abc])+").match("abc").group(0))
print(re.compile("(?:[abc])+").match("abc").group(0))

abc
abc

In [24]:
# the difference shows in groups(): a repeated capturing group keeps its last
# repetition, while a non-capturing group contributes nothing
print(re.compile("([abc])+").match("abc").groups())
print(re.compile("(?:[abc])+").match("abc").groups())

('c',)
()


In [25]:
# lookahead: (?=Volcano) requires the text ahead to be "Volcano" without consuming it;
# (?!Volcano) requires that it is not
for test in (re.compile(r'^(?=Volcano)[a-zA-Z]+'),
             re.compile(r'^(?!Volcano)[a-zA-Z]+')):
    print(test.findall("Volcano I"))

['Volcano']
[]

In [26]:
# lookbehind: (?<=Volcano) requires "Volcano" immediately before the current position;
# (?<!Volcano) requires that it is not there.
# NOTE: "^" pins the match to position 0, where nothing precedes it, so the positive
# form can never succeed here and the negative form always does.
for test in (re.compile(r'^(?<=Volcano)[a-zA-Z]+'),
             re.compile(r'^(?<!Volcano)[a-zA-Z]+')):
    print(test.findall("Volcano I"))

[]
['Volcano']

In [ ]: