import re
# Text in which a sub-string is looked up
text = "Today is good day to learn regular expression."
# Literal pattern to locate
pattern = r'regular expression'
# re.search scans the whole text for the first place the pattern matches
match = re.search(pattern, text)
# span() is a (start, end) pair, so both offsets can be unpacked at once
start_index, end_index = match.span()
match_string = match.group(0)
print(
    "The location of '{}' is between {} and {} in text".format(
        match_string, start_index, end_index
    )
)
The location of 'regular expression' is between 27 and 45 in text
# Demo
# Ordinary literal in regular expression
import re
# Text the regular expression is applied to
text = "Today is Monday, tommorrow is Tuesday."
# A pattern made only of ordinary characters matches itself literally
pattern = r'Monday'
# Look for the first occurrence anywhere in the text
match = re.search(pattern, text)
# A successful search yields a match object holding the offsets and the text
print(type(match))
start_index = match.start()
end_index = match.end()
match_string = match.group(0)
print(
    "The location of '{}' is between {} and {} in text".format(
        match_string, start_index, end_index
    )
)
<class '_sre.SRE_Match'> The location of 'Monday' is between 9 and 15 in text
# Ordinary literal in regular expression
# re cannot find a match in the text
import re
# Text the regular expression is applied to
text = "Today is Monday, tommorrow is Tuesday."
# 'Wednesday' never occurs in the text
pattern = r'Wednesday'
# With nothing to find, re.search returns None instead of a match object
match = re.search(pattern, text)
# Show the type and the value of the result, one per line
print(type(match), match, sep='\n')
<class 'NoneType'> None
import re
# Literal pattern and the text to scan
pattern = r'cookie'
text = "Cake and cookie"
# Non-compiled version: the module-level function takes the pattern string
match = re.search(pattern=pattern, string=text)
# group(0) is the whole matched text
print(match.group(0))
cookie
以下列舉出幾個在使用 re 模組時,常用的類別與函式
若程式在 string 中找到符合 pattern 的字串,則會回傳一個型態為 Match 的物件。反之,則回傳一個 None 物件。
Note
與 re.match 不同之處在於,只要 string 中的任何位置出現符合 pattern 的字串, 就會回傳 Match 物件
# re.search
import re
# Text the regular expression is applied to
text = "Cake and cookie"
# Literal pattern to look for
pattern = "cookie"
# re.search accepts a match at any position in the text
match = re.search(pattern=pattern, string=text)
# Report the type of the result and the text that matched
print("Type of 'match' object:", type(match))
print("Matching pattern: %r" % (match.group(0),))
Type of 'match' object: <class '_sre.SRE_Match'> Matching pattern: 'cookie'
Arguments
定義好的正規表示式(e.g r'cookie')
欲套用正規表示式之文字資料
預設值為0
Return Value
若程式在 string 中找到符合 pattern 的字串,則會回傳一個型態為 Match 的物件。反之,則回傳一個 None 物件。
Note match 函式只會比對 string 開頭的文字,若只有 string 的中間部份符合 pattern,則不會回傳 Match 物件。
# demo
# re.match
import re
# Two texts: the pattern sits at the end of one and at the start of the other
text1 = "Cake and cookie"
text2 = "cookie and Cake"
# Literal pattern to look for
pattern = "cookie"
# re.match only succeeds when the pattern matches at the very beginning
match1, match2 = [re.match(pattern, candidate) for candidate in (text1, text2)]
# text1 does not start with the pattern, so match1 is None
print("Type of 'match1' object:", type(match1))
print(match1)
print("=" * 50)
# text2 starts with the pattern, so match2 is a match object
print("Type of 'match2' object:", type(match2))
print("Matching pattern: %r" % (match2.group(0),))
Type of 'match1' object: <class 'NoneType'> None ================================================== Type of 'match2' object: <class '_sre.SRE_Match'> Matching pattern: 'cookie'
Arguments
定義好的正規表示式(e.g r'cookie')
可以是字串抑或是函式(可呼叫的物件)。若為字串則會將符合 pattern 的字串轉成 replace。若為函式,則會將函式的回傳值當作替代的文字。
欲套用正規表示式之文本字串
預設為0,會將所有符合 pattern 的字串替代成相對應的字串(replace)。若為大於0的值,則只會轉換前 count 個符合 pattern 的字串。
預設值為0
Return Value
回傳一個做過轉換的新的字串
# Demo
# re.sub with plain string
import re
# Text the regular expression is applied to
text = "cookie and Cake"
# Pattern to be replaced
pattern = r'cookie'
# With a plain-string replacement, every match of the pattern becomes 'biscuit'
new_text = re.sub(pattern, repl='biscuit', string=text)
# The original text is left untouched; new_text holds the substituted copy
print("Original text:", text)
print("After substitution:", new_text)
Original text: cookie and Cake After substitution: biscuit and Cake
# Demo
# re.sub with customed function
import re
# Text the regular expression is applied to
text = "cookie and Cake"
# Pattern to be replaced
pattern = r'cookie'


def repl(match):
    """Return the replacement for one match: the matched text, doubled.

    re.sub calls this function for each match it finds and substitutes the
    returned string for the matched text.

    Args:
        match: match object describing the text that matched the pattern.

    Returns:
        The matched text joined to itself with a '-'.
    """
    matched = match.group()
    return matched + '-' + matched


# Passing a callable instead of a string lets the replacement depend on the match
new_text = re.sub(pattern, repl, text)
print("Original text:", text)
print("After substitution:", new_text)
Original text: cookie and Cake After substitution: cookie-cookie and Cake
Arguments
欲快取住的正規表示式
預設值為0
Return Value
回傳一個 Pattern 型態的物件
# Demo
# re.compile
import re
# Text the regular expression is applied to
text = "cookie and Cake"
# Pattern to be replaced
pattern = r'cookie'
# Compile once to obtain a reusable pattern object
regex = re.compile(pattern)
# The pattern object's method and the module-level function give the same result
new_text1 = regex.sub('biscuit', text)
new_text2 = re.sub(r'cookie', 'biscuit', text)
print(new_text1, new_text2, sep='\n')
biscuit and Cake biscuit and Cake
%%time
# Newbie style: hand the pattern string to re.search on every call.
# NOTE: the re module caches recently compiled patterns, so the pattern is
# not rebuilt on each iteration; what this style pays for is the per-call
# cache lookup done inside the module-level function.
text1 = 'Cake and cookie'
text2 = 'cookie and Cake'
text3 = 'Cake cookie'
# Search each text 100 times so the cost is large enough to time
for i in range(100):
    match1 = re.search(r'cookie', text1)
    match2 = re.search(r'cookie', text2)
    match3 = re.search(r'cookie', text3)
CPU times: user 275 µs, sys: 25 µs, total: 300 µs Wall time: 304 µs
%%time
# Prof style: compile the pattern once and reuse the pattern object, so the
# loop calls the object's own search method instead of the module-level
# function.
text1 = 'Cake and cookie'
text2 = 'cookie and Cake'
text3 = 'Cake cookie'
# Compile the regular expression a single time, outside the loop
regex = re.compile(r'cookie')
# Search each text 100 times so the cost is large enough to time
for i in range(100):
    match1 = regex.search(text1)
    match2 = regex.search(text2)
    match3 = regex.search(text3)
CPU times: user 101 µs, sys: 9 µs, total: 110 µs Wall time: 113 µs
Arguments
預設值為 group 0,代表整個 matching 的字串。group 可以為所有合法 group 的索引值(e.g. 1, 2, 3, etc)。
Return Value
Tuple 物件,表示此 group 在文本中的位置(起始位置, 結束位置)
# 5 min practice
# match.span
import re
text = 'Cake and cookie'
# Literal pattern to locate
pattern = r'cookie'
# Find the first occurrence anywhere in the text
match = re.search(pattern, text)
# span() is the (start, end) pair; start() and end() return each half
print("Match starting index:", match.start())
print("Match ending index:", match.end())
# Slicing the text with the span recovers the matched sub-string
print("Result String:", text[slice(*match.span())])
Match starting index: 9 Match ending index: 15 Result String: cookie
group1 若沒指定則 default 值為 0,代表回傳整串 match 的字串。
groupN 可以為任意有效的 group 索引值(e.g 1, 2, 3, etc)。
回傳屬於那個 group 的字串,若傳入的參數 [group1, ...] 超過一個,則回傳一個 tuple,包含所有指定 group 的字串。
若查找的字串不符合 group 中的定義則回傳 None。
# Recap
# Only one parameter in match.group
import re
text = 'Cake and cookie'
# Literal pattern to locate
pattern = r'cookie'
# Find the first occurrence anywhere in the text
match = re.search(pattern=pattern, string=text)
# Calling group() with no argument is the same as asking for group 0:
# both return the whole matched text
print("Entire matching string:", match.group())
print("Entire matching string:", match.group(0))
Entire matching string: cookie Entire matching string: cookie
# Demo
# two or more parameter in match.group
import re
text = 'Cake and cookie'
# Each pair of parentheses in the pattern defines a numbered group
pattern = r'(Cake) and (cookie)'
# Find the first occurrence anywhere in the text
match = re.search(pattern=pattern, string=text)
# One index: the text captured by that group
print("Group1 matching string:", match.group(1))
print("Group2 matching string:", match.group(2))
# Several indexes: a tuple with one entry per requested group
print("Group1 and Group2 matching strings:", match.group(1, 2))
# The pattern defines only two groups, so asking for match.group(3)
# would raise an error (IndexError: no such group)
Group1 matching string: Cake Group2 matching string: cookie Group1 and Group2 matching strings: ('Cake', 'cookie')
# match.groups()
import re
text = 'Cake and cookie'
# Two groups; the trailing '?' makes the second one optional
pattern = r'(Cake) and (cookie)?'
# Find the first occurrence anywhere in the text
match = re.search(pattern=pattern, string=text)
# groups() returns the text of every group as a single tuple
print("Groups of match:", match.groups())
Groups of match: ('Cake', 'cookie')
回傳一個字典物件,包含了所有被命名的 groups。鍵值(key)為 group 的名稱,值(value)為該 group 所匹配到的字串。
?P: 命名 group 前必須加的前綴字
group_name: group 的名稱
pattern: group 的正規表示式
# Demo
# match.groupdict
import re
text = "Cake and cookie"
# (?P<name>...) gives a group a name in addition to its number
pattern = r'(?P<fatter>Cake) and (?P<fat>cookie)'
# Find the first occurrence anywhere in the text
match = re.search(pattern=pattern, string=text)
# groupdict() maps every group name to the text that group captured
d = match.groupdict()
print(type(d), d, sep='\n')
<class 'dict'> {'fat': 'cookie', 'fatter': 'Cake'}
若是正規表示式只能使用純文字(plain text),那它的功能也太弱了吧...。所以除了純文字之外,它還可以使用其他特殊字元來表示集合、特殊符號、旗標等等,提供更有彈性且更強大的功能。
Character | Meaning |
---|---|
. | Match any single character except newline('\n') |
\w | Match any single letter, digit, or underscore |
\W | Match any character not part of \w |
\s | Match a single whitespace character like: space, newline, tab, return |
\S | Match any character not part of \s |
\t | Match tab |
\n | Match newline |
\r | Match return |
\d | Match decimal digit 0-9 |
^ | Match a pattern at the start of the string |
$ | Match a pattern at the end of the string |
\A | Match only at the start of the string |
\b | Match only the beginning or end of the word |
\ | Escape the special character that follows so it is matched literally (e.g. \. matches a dot) |
[...] | Match character that appears between '[ ]' |
[^...] | Match character that does not appear in '[ ]' |
還有其他很多特殊文字,有興趣的話可以自己再參考 Python re 的文件
# demo
# \w and \d practice
# Extract CSIE and room_id
import re
# Text the regular expression is applied to
text = "CSIE-65405"
# Four \w characters form the department, five \d characters the room id
pattern = r'(?P<department>\w\w\w\w)-(?P<room_id>\d\d\d\d\d)'
# Locate the pattern in the text
match = re.search(pattern=pattern, string=text)
# The named groups come back as a dict keyed by group name
print(match.groupdict())
{'room_id': '65405', 'department': 'CSIE'}
請利用 re 模組,寫一個程式讓使用者能夠輸入名字,並且讓程式取出其 lastname 和 firstname。
# 5 mins for coding
# [...] practice
# Extract CSIE and room_id
import re
text = "CSIE-65405"
# Same extraction with explicit character sets: [a-zA-Z] is one ASCII letter,
# [0-9] is one digit
pattern = r'(?P<department>[a-zA-Z][a-zA-Z][a-zA-Z][a-zA-Z])-(?P<room_id>[0-9][0-9][0-9][0-9][0-9])'
# Locate the pattern in the text
match = re.search(pattern=pattern, string=text)
# The named groups come back as a dict keyed by group name
print(match.groupdict())
{'room_id': '65405', 'department': 'CSIE'}
請利用 re 模組,寫一個簡單的使用者驗證系統,使用者提供帳號,密碼,程式判斷是否為合法的帳號密碼。
帳號格式:
密碼格式:
# Example
import re
class AuthSystem:
    """Toy credential checker built on pre-compiled regular expressions.

    The accepted username and password are hard-coded patterns. This is a
    teaching example for the re module, not a way to store real credentials.
    """

    def __init__(self):
        """Compile the username and password patterns once."""
        self.username_regex = re.compile(r'johnny')
        self.password_regex = re.compile(r'johnny860410')

    def _check_username(self, username):
        """Print the verdict and return whether ``username`` is valid.

        fullmatch is used rather than search so that the whole input has to
        match the pattern; input that merely contains the expected text
        (e.g. "xjohnnyx") is rejected.

        Args:
            username: the user name to validate (a string).

        Returns:
            True if the entire string matches the username pattern, else False.
        """
        if self.username_regex.fullmatch(username) is None:
            print("Wrong username")
            return False
        print("Correct username")
        return True

    def _check_password(self, password):
        """Print the verdict and return whether ``password`` is valid.

        As with the username, the whole input must match the pattern.

        Args:
            password: the password to validate (a string).

        Returns:
            True if the entire string matches the password pattern, else False.
        """
        if self.password_regex.fullmatch(password) is None:
            print("Wrong password")
            return False
        print("Correct password")
        return True

    def authenticate(self, username, password):
        """Check the username, then the password, and print the outcome.

        The password is only examined when the username is valid. Prints
        "Valid user" when both checks pass. Returns None in every case.
        """
        if not self._check_username(username):
            return
        if not self._check_password(password):
            return
        print("Valid user")
# Build the authentication helper
auth = AuthSystem()
# Run the demo credentials through the username and password checks
auth.authenticate(username="johnny", password="johnny860410")
Correct username Correct password Valid user
Character | Meaning |
---|---|
+ | Match one or more repetitions of the item to its left |
* | Match zero or more repetitions of the item to its left |
? | Match zero or one repetition of the item to its left |
{x} | Match exactly 'x' repetitions of the item to its left |
{x,} | Match 'x' or more repetitions of the item to its left |
{x,y} | Match at least 'x' and at most 'y' repetitions of the item to its left |
# Demo
# '+' practice
import re
# Two texts: one without any 'Cake', one with 'Cake' repeated three times
text1 = " and cookie"
text2 = "CakeCakeCake and cookie"
# '+' requires the group (Cake) to occur at least once
plus_pattern = "(Cake)+ and cookie"
# Apply the same pattern to both texts
plus_match1, plus_match2 = [
    re.search(plus_pattern, candidate) for candidate in (text1, text2)
]
# text1 has no 'Cake' at all, so the first search yields None
print(plus_match1)
print(plus_match2.group(0))
None CakeCakeCake and cookie
# Demo
import re
# Two texts: one without any 'Cake', one with 'Cake' repeated three times
text1 = " and cookie"
text2 = "CakeCakeCake and cookie"
# '*' lets the group (Cake) occur any number of times, including zero
mul_pattern = "(Cake)* and cookie"
# Apply the same pattern to both texts
mul_match1, mul_match2 = [
    re.search(mul_pattern, candidate) for candidate in (text1, text2)
]
# Both texts match: zero repetitions are enough for text1
print(mul_match1.group(0))
print(mul_match2.group(0))
and cookie CakeCakeCake and cookie
# 5 mins for coding
# '?' practice
import re
# Two texts: one without 'Cake', one with a single 'Cake'
text1 = " and cookie"
text2 = "Cake and cookie"
# '?' lets the group (Cake) occur zero times or once
ques_pattern = "(Cake)? and cookie"
# Apply the same pattern to both texts
ques_match1, ques_match2 = [
    re.search(ques_pattern, candidate) for candidate in (text1, text2)
]
# Both texts match: the group is optional
print(ques_match1.group(0))
print(ques_match2.group(0))
and cookie Cake and cookie
請延續上一題的練習題,擴充其功能,讓能夠接受(accept)的帳號和密碼格式更複雜。
帳號格式:
密碼格式:
正規表示式在做查找(match)的時候,會盡量查找(match)到最長符合的字串,這種行為我們稱為 Greedy match,但有時這些行為不是我們所期望的。
在最短符合的字串就停止查找,這種行為我們稱為 Non-Greedy match。
# Greedy Match
import re
text = "<h1> Title </h1>"
# '.*' is greedy: it takes as many characters as it can before the final '>'
pattern = r'<.*>'
# Find the first occurrence anywhere in the text
match = re.search(pattern=pattern, string=text)
# The match runs from the first '<' to the last '>' in the text
print(match.group(0))
<h1> Title </h1>
# Non-Greedy match
import re
text = "<h1> Title </h1>"
# '.*?' is non-greedy: it takes as few characters as it can before a '>'
pattern = r'<.*?>'
# Find the first occurrence anywhere in the text
match = re.search(pattern=pattern, string=text)
# The match stops at the first '>' it reaches
print(match.group(0))
<h1>