#!/usr/bin/env python
# coding: utf-8

# Notebook export: a tour of regular-expression features using Python's `re`
# module. Bare expression statements are notebook cell outputs; when this file
# is run as a plain script only the explicit print() calls produce output.

# In[1]:


import re


# ## Regular Expressions

# ### Basic Regular Expression Patterns

# In[2]:


# negated class: runs of characters that are neither S nor s
re.compile(r'[^Ss]+').findall("Some strings.")


# In[3]:


# negated class: runs of characters that are not a period
re.compile(r'[^\.]+').findall("work")


# In[4]:


# either e or ^ (a caret that is not first in the class is a literal caret)
re.compile(r'[e^]+').findall("egg, ^")


# In[5]:


# runs made of a, ^ and b, e.g. "a^b"
re.compile(r'[a^b]+').findall("hello, a^b.")


# In[6]:


# optional element: ? makes the preceding "u" optional
re.compile(r'colou?r').findall("color, colour")


# In[7]:


# an integer, spelled as one digit followed by zero or more digits
re.compile(r'[0-9][0-9]*').findall("2")


# In[8]:


# an integer, spelled with + (one or more digits)
re.compile(r'[0-9]+').findall("2")


# In[9]:


# . matches any single character
re.compile(r'beg.n').findall("begin, beg'n, begun")


# In[10]:


# anchors: $ only matches at the end of the string
# \. means . is a literal period, not the wildcard
re.compile(r'dog\.$').findall("the dog.")


# In[11]:


# \b is a word boundary
re.compile(r'\bthe\b').findall("other, the, $they")


# ### Disjunction, Grouping, and Precedence

# In[12]:


# disjunction
re.compile(r'cat|dog').findall("there are a cat and a dog.")


# In[13]:


# precedence: parentheses restrict the disjunction to the suffix
# (with a capturing group, findall returns the group text, not the whole match)
re.compile(r'gupp(y|ies)').findall("guppy and guppies.")


# In[14]:


re.compile(r'Column [0-9]+ *').findall("Column 1 Column 2 Column 3 Column 4.")


# In[15]:


# () makes the trailing * apply to the whole "Column N" group
re.compile(r'(Column [0-9]+ *)*').findall("Column 1 Column 2 Column 3 Column 4.")


# In[16]:


# sequences have a higher precedence than disjunction: this is "the" or "any",
# so "theny" is never matched as a whole
re.compile(r'the|any').findall("the, any, theny")


# ### A Simple Example

# In[17]:


# match the word "the", refining the pattern step by step
# lowercase only
print(re.compile(r'the').findall("the, The, the_, the25"))
# either case, but also hits the "the" inside "their"
print(re.compile(r'[tT]he').findall("the, their, the_, the25"))
# word boundaries; "_" and digits count as word characters for \b
print(re.compile(r'\b[tT]he\b').findall("the, their, the_, the25"))
# require a non-letter on both sides; needs a character before the match
print(re.compile(r'[^a-zA-Z][tT]he[^a-zA-Z]').findall("the, their, the_, the25"))
# also accept the start or the end of the string in place of a non-letter
print(re.compile(r'(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)').findall("the, their, the_, the25"))


# ### A More Complex Example

# In[18]:


# a dollar amount: up to three digits and an optional fractional part
p = re.compile(r'\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in p.finditer("$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)


# In[19]:


# same, but the "$" must sit at the start of the string or after a non-word
# character
p = re.compile(r'(^|\W)\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in p.finditer("$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)


# ### Regular Expression Substitution, Capture Groups, and ELIZA

# In[20]:


# \1 must repeat exactly the text captured by the first group
for item in re.compile(r'the (.*)er they were, the \1er they will be').finditer(
        "the bigger they were, the bigger they will be but not the bigger they were, the faster they will be."):
    print(item)


# In[21]:


# two capture groups, each referenced again by number
for item in re.compile(r'the (.*)er they (.*), the \1er we \2').finditer(
        "the faster they ran, the faster we ran but not the faster they ran, the faster we ate."):
    print(item)


# In[22]:


# non-capturing group: (?:...) is not numbered, so \1 is (people|cats)
for item in re.compile(r'(?:some|a few) (people|cats) like some \1').finditer(
        "a few cats like some cats but not some cats like some a few."):
    print(item)


# In[23]:


# group() is the whole match whether or not the group captures
print(re.match("([abc])+", "abc").group())
print(re.match("(?:[abc])+", "abc").group())


# In[24]:


# groups() lists capturing groups only
print(re.match("([abc])+", "abc").groups())
print(re.match("(?:[abc])+", "abc").groups())


# ### Lookahead assertions

# In[25]:


# lookahead: positive (?=...) and negative (?!...)
test = re.compile(r'^(?=Volcano)[a-zA-Z]+')
print(test.findall("Volcano I"))
test = re.compile(r'^(?!Volcano)[a-zA-Z]+')
print(test.findall("Volcano I"))


# In[26]:


# lookbehind: (?<=...)
test = re.compile(r'^(?<=Volcano)[a-zA-Z]+')
print(test.findall("Volcano I"))

# NOTE(review): the source is cut off in the middle of the next statement. The
# fragment is kept here as a comment rather than guessed at; restore it from
# the original notebook:
#     test = re.compile(r'^(?