import pandas as pd
import numpy as np
import re as re
# Sample records used throughout the examples below. Tina's email is
# deliberately missing so the string methods can be seen propagating NaN.
# `np.nan` is used rather than an upper-case alias because it is the
# spelling available on every NumPy version.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'email': [
        'jas203@gmail.com',
        'momomolly@gmail.com',
        np.nan,
        'battler@milner.com',
        'Ames1234@yahoo.com',
    ],
    'preTestScore': [4, 24, 31, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70],
}
# Passing `columns` explicitly pins the column order of the frame.
df = pd.DataFrame(
    raw_data,
    columns=['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'],
)
df
 | first_name | last_name | email | preTestScore | postTestScore |
---|---|---|---|---|---|
0 | Jason | Miller | jas203@gmail.com | 4 | 25 |
1 | Molly | Jacobson | momomolly@gmail.com | 24 | 94 |
2 | Tina | Ali | NaN | 31 | 57 |
3 | Jake | Milner | battler@milner.com | 2 | 62 |
4 | Amy | Cooze | Ames1234@yahoo.com | 3 | 70 |
5 rows × 5 columns
# Flag, row by row, whether the email text contains "gmail"; a missing
# email stays NaN instead of becoming False.
df.email.str.contains('gmail')
0 True 1 True 2 NaN 3 False 4 False Name: email, dtype: object
# Email regex with three capture groups: local part, domain name, and a
# 2-4 letter top-level domain. The character classes are upper-case only,
# so the pattern relies on a case-insensitive flag at the call site.
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
# List every (local, domain, tld) tuple found in each email.
df.email.str.findall(pattern, flags=re.I)
0 [(jas203, gmail, com)] 1 [(momomolly, gmail, com)] 2 NaN 3 [(battler, milner, com)] 4 [(Ames1234, yahoo, com)] Name: email, dtype: object
# Capture the (local, domain, tld) groups of each email as one tuple per row.
# `Series.str.match` in current pandas only reports True/False, so the groups
# are pulled out with `findall` instead. The `\A(?:...)` wrapper keeps
# `match`'s "must match at the start of the string" behaviour without adding
# a capture group, and `.str[0]` takes the single hit (NaN when the email is
# missing or does not match).
matches = df['email'].str.findall(r'\A(?:' + pattern + r')', flags=re.IGNORECASE).str[0]
matches
0 (jas203, gmail, com) 1 (momomolly, gmail, com) 2 NaN 3 (battler, milner, com) 4 (Ames1234, yahoo, com) Name: email, dtype: object
# Take element 1 of each row's match result, i.e. the domain-name group
# of the email pattern.
matches.str.get(1)
0 gmail 1 gmail 2 NaN 3 milner 4 yahoo Name: email, dtype: object