import re
import pandas as pd
# Create a dataframe with a single column of strings
data = {'raw': ['Arizona 1 2014-12-23 3242.0',
'Iowa 1 2010-02-23 3453.7',
'Oregon 0 2014-06-20 2123.0',
'Maryland 0 2014-03-14 1123.6',
'Florida 1 2013-01-15 2134.0',
'Georgia 0 2012-07-14 2345.6']}
df = pd.DataFrame(data, columns = ['raw'])
df
raw | |
---|---|
0 | Arizona 1 2014-12-23 3242.0 |
1 | Iowa 1 2010-02-23 3453.7 |
2 | Oregon 0 2014-06-20 2123.0 |
3 | Maryland 0 2014-03-14 1123.6 |
4 | Florida 1 2013-01-15 2134.0 |
5 | Georgia 0 2012-07-14 2345.6 |
6 rows × 1 columns
# Which rows of df['raw'] contain 'xxxx-xx-xx'?
df['raw'].str.contains('....-..-..', regex=True)
0 True 1 True 2 True 3 True 4 True 5 True Name: raw, dtype: bool
# In the column 'raw', extract single digit in the strings
df['female'] = df['raw'].str.extract('(\d)')
df['female']
0 1 1 1 2 0 3 0 4 1 5 0 Name: female, dtype: object
# In the column 'raw', extract xxxx-xx-xx in the strings
df['date'] = df['raw'].str.extract('(....-..-..)')
df['date']
0 2014-12-23 1 2010-02-23 2 2014-06-20 3 2014-03-14 4 2013-01-15 5 2012-07-14 Name: date, dtype: object
# In the column 'raw', extract ####.## in the strings
df['score'] = df['raw'].str.extract('(\d\d\d\d\.\d)')
df['score']
0 3242.0 1 3453.7 2 2123.0 3 1123.6 4 2134.0 5 2345.6 Name: score, dtype: object
# In the column 'raw', extract the word in the strings
df['state'] = df['raw'].str.extract('([A-Z]\w{0,})')
df['state']
0 Arizona 1 Iowa 2 Oregon 3 Maryland 4 Florida 5 Georgia Name: state, dtype: object
df
raw | female | date | score | state | |
---|---|---|---|---|---|
0 | Arizona 1 2014-12-23 3242.0 | 1 | 2014-12-23 | 3242.0 | Arizona |
1 | Iowa 1 2010-02-23 3453.7 | 1 | 2010-02-23 | 3453.7 | Iowa |
2 | Oregon 0 2014-06-20 2123.0 | 0 | 2014-06-20 | 2123.0 | Oregon |
3 | Maryland 0 2014-03-14 1123.6 | 0 | 2014-03-14 | 1123.6 | Maryland |
4 | Florida 1 2013-01-15 2134.0 | 1 | 2013-01-15 | 2134.0 | Florida |
5 | Georgia 0 2012-07-14 2345.6 | 0 | 2012-07-14 | 2345.6 | Georgia |
6 rows × 5 columns