#!/usr/bin/env python
# coding: utf-8

# In[1]:


import re


# ## Regular Expressions

# ### Basic Regular Expression Patterns

# In[2]:


# A leading ^ inside [...] negates the class: each match is a maximal run
# of characters that are neither "S" nor "s".
re.findall(r'[^Ss]+', "Some strings.")


# In[3]:


# Runs of characters that are not a period. Inside [...] the period is
# already literal, so the backslash is harmless but not required.
re.findall(r'[^\.]+', "work")


# In[4]:


# ^ only negates when it is the first character of the class; anywhere else
# it is a literal caret, so this class means "e" or "^".
re.findall(r'[e^]+', "egg, ^")


# In[5]:


# A class holding "a", a literal "^" and "b"; with + it matches any run of
# those three characters, such as the text "a^b".
re.findall(r'[a^b]+', "hello, a^b.")


# In[6]:


# ? makes the preceding "u" optional, so both spellings match.
re.findall(r'colou?r', "color, colour")


# In[7]:


# An integer spelled out the long way: one digit, then zero or more digits.
re.findall(r'[0-9][0-9]*', "2")


# In[8]:


# The same integer pattern using +, i.e. "one or more digits".
re.findall(r'[0-9]+', "2")


# In[9]:


# An unescaped . stands for any single character (here "i", "'" or "u").
re.findall(r'beg.n', "begin, beg'n, begun")


# In[10]:


# Anchors: $ pins the match to the end of the string.
# \. is an escaped, literal period rather than the wildcard.
re.findall(r'dog\.$', "the dog.")


# In[11]:


# \b is a word boundary: "the" must stand alone, so the occurrences inside
# "other" and "$they" are rejected.
re.findall(r'\bthe\b', "other, the, $they")


# ###  Disjunction, Grouping, and Precedence

# In[12]:


# | is disjunction: match either "cat" or "dog".
re.findall(r'cat|dog', "there are a cat and a dog.")


# In[13]:


# Parentheses restrict the disjunction to the suffix: "gupp" + ("y" or "ies").
# Because the pattern has one capturing group, findall returns only the
# captured suffixes, not the whole words.
re.findall(r'gupp(y|ies)', "guppy and guppies.")


# In[14]:


# One column label: "Column ", a number, then any number of trailing spaces.
re.findall(r'Column [0-9]+ *', "Column 1 Column 2 Column 3 Column 4.")


# In[15]:


# Parentheses make the whole column pattern a single unit for *, so one
# match spans the entire run of columns. findall then reports only what the
# group captured on its last repetition, and because the starred group may
# match nothing, zero-length matches appear as empty strings.
re.findall(r'(Column [0-9]+ *)*', "Column 1 Column 2 Column 3 Column 4.")


# In[16]:


# Sequences bind more tightly than disjunction, so this pattern reads as
# "the" OR "any" -- not "th", then "e" or "a", then "ny". "theny" is therefore
# never matched as a whole; only the "the" inside it is found.
re.findall(r'the|any', "the, any, theny")


# ### A Simple Example

# In[17]:


# Matching the word "the", refined step by step.
# 1. A plain literal is case-sensitive, so "The" is missed.
print(re.findall(r'the', "the, The, the_, the25"))
# 2. [tT] accepts either case, but "the" inside "their" is matched too.
print(re.findall(r'[tT]he', "the, their, the_, the25"))
# 3. Word boundaries reject "their"; "_" and digits count as word characters,
#    so "the_" and "the25" are rejected as well.
print(re.findall(r'\b[tT]he\b', "the, their, the_, the25"))
# 4. Requiring a non-letter on both sides accepts "the_" and "the25", but the
#    neighbours are consumed and "the" at the start of the string is missed.
print(re.findall(r'[^a-zA-Z][tT]he[^a-zA-Z]', "the, their, the_, the25"))
# 5. Also allowing start/end of string recovers the leading "the"; with two
#    capturing groups findall returns (left, right) tuples.
print(re.findall(r'(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)', "the, their, the_, the25"))


# ### A More Complex Example

# In[18]:


# Price pattern: "$", at most three digits, an optional ".digits" fraction,
# then a word boundary. Nothing constrains what precedes the "$", so the one
# glued to "a" is matched too; and since {0,3} may match zero digits,
# "$1999999.99" yields just the bare "$".
p = re.compile(r'\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in re.finditer(
        p, "$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)


# In[19]:


# Same price pattern, but (^|\W) demands that the "$" sits at the start of the
# string or right after a non-word character, so "a$199.1" no longer matches.
# The preceding character (if any) becomes part of the match.
p = re.compile(r'(^|\W)\$[0-9]{0,3}(\.[0-9]+)?\b')
for item in re.finditer(
        p, "$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1"):
    print(item)


# ### Regular Expression Substitution, Capture Groups, and ELIZA

# In[20]:


# \1 must repeat exactly the text captured by group 1, so only the clause in
# which both adjectives agree ("bigger ... bigger") is matched.
for item in re.finditer(
        r'the (.*)er they were, the \1er they will be',
        "the bigger they were, the bigger they will be but not the bigger they were, the faster they will be."):
    print(item)


# In[21]:


# Two capture groups with two backreferences: \1 repeats the adjective stem
# and \2 repeats the verb, so "ran ... ate" does not match.
for item in re.finditer(
        r'the (.*)er they (.*), the \1er we \2',
        "the faster they ran, the faster we ran but not the faster they ran, the faster we ate."):
    print(item)


# In[22]:


# (?:...) groups without capturing, so it gets no group number and \1 refers
# to (people|cats), the first *capturing* group.
for item in re.finditer(
        r'(?:some|a few) (people|cats) like some \1',
        "a few cats like some cats but not some cats like some a few."):
    print(item)


# In[23]:


# group() is the whole match, which is the same whether or not the
# parentheses capture.
print(re.compile("([abc])+").match("abc").group())
print(re.compile("(?:[abc])+").match("abc").group())


# In[24]:


# groups() lists only the capturing groups: the capturing form keeps its last
# repetition, while the non-capturing form has no groups at all.
print(re.compile("([abc])+").match("abc").groups())
print(re.compile("(?:[abc])+").match("abc").groups())


# ### Lookahead and lookbehind assertions

# In[25]:


# Lookahead assertions check the text that follows without consuming it.
# Positive (?=Volcano): the string must start with "Volcano".
test = re.compile(r'^(?=Volcano)[a-zA-Z]+')
print(re.findall(test, "Volcano I"))
# Negative (?!Volcano): the string must NOT start with "Volcano".
test = re.compile(r'^(?!Volcano)[a-zA-Z]+')
print(re.findall(test, "Volcano I"))


# In[26]:


# Lookbehind assertions check the text that precedes the current position.
# NOTE: both assertions sit right after ^, where nothing precedes the
# position, so the positive form can never succeed ...
test = re.compile(r'^(?<=Volcano)[a-zA-Z]+')
print(re.findall(test, "Volcano I"))
# ... and the negative form always succeeds.
test = re.compile(r'^(?<!Volcano)[a-zA-Z]+')
print(re.findall(test, "Volcano I"))


# In[ ]: