These are the follow-up code samples for the following blog post: Pandas DataFrame by Example

In [38]:
import pandas as pd
import numpy as np
In [2]:
pd.__version__
Out[2]:
'0.23.1'
In [41]:
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df
Out[41]:
name age state num_children num_pets
0 john 23 iowa 2 0
1 mary 78 dc 2 4
2 peter 22 california 0 0
3 jeff 19 texas 1 5
4 bill 45 washington 2 0
5 lisa 33 dc 1 0

Select rows by position

In [42]:
# select the first 2 rows
df.iloc[:2]
Out[42]:
name age state num_children num_pets
0 john 23 iowa 2 0
1 mary 78 dc 2 4
In [43]:
# select the last 2 rows
df.iloc[-2:]
Out[43]:
name age state num_children num_pets
4 bill 45 washington 2 0
5 lisa 33 dc 1 0

Select rows by index value

compare this with iloc above

In [44]:
# select rows up to and including the one
# with index=2
df.loc[:2]
Out[44]:
name age state num_children num_pets
0 john 23 iowa 2 0
1 mary 78 dc 2 4
2 peter 22 california 0 0

Select rows based upon the value of columns

In [45]:
# by a simple numeric condition
df[df["age"] > 30]
Out[45]:
name age state num_children num_pets
1 mary 78 dc 2 4
4 bill 45 washington 2 0
5 lisa 33 dc 1 0
In [46]:
# comparing the value of two columns
df[ df["num_pets"] > df[ "num_children"] ]
Out[46]:
name age state num_children num_pets
1 mary 78 dc 2 4
3 jeff 19 texas 1 5
In [47]:
# using boolean AND
df[ (df["age"] > 40) & (df["num_pets"] > 0) ]
Out[47]:
name age state num_children num_pets
1 mary 78 dc 2 4

select columns starting with

In [48]:
df[[colname for colname in df.columns if colname.startswith('n')]]
Out[48]:
name num_children num_pets
0 john 2 0
1 mary 2 4
2 peter 0 0
3 jeff 1 5
4 bill 2 0
5 lisa 1 0

select all columns but one

In [49]:
df.drop(['age'],axis=1)
Out[49]:
name state num_children num_pets
0 john iowa 2 0
1 mary dc 2 4
2 peter california 0 0
3 jeff texas 1 5
4 bill washington 2 0
5 lisa dc 1 0

Drop a column

In [50]:
df.drop(["age","num_children"],axis=1)
Out[50]:
name state num_pets
0 john iowa 0
1 mary dc 4
2 peter california 0
3 jeff texas 5
4 bill washington 0
5 lisa dc 0

Apply a function to every column (as aggregates)

Using numpy vectorized functions for numerical values

In [52]:
# get the mean for each of the selected columns
df[["age","num_pets","num_children"]].apply(lambda row: np.mean(row),axis=0).to_frame()
Out[52]:
0
age 36.666667
num_pets 1.500000
num_children 1.333333

Apply a function to every row (as aggregates)

Using numpy vectorized functions for numerical values

In [54]:
# sum columns age, num_pets and num_children for each row
df[["age","num_pets","num_children"]].apply(lambda row: np.sum(row),axis=1).to_frame()
Out[54]:
0
0 25
1 84
2 22
3 25
4 47
5 34

Apply a function elementwise using apply

In [55]:
df[["age"]].apply(lambda value: value*2)
Out[55]:
age
0 46
1 156
2 44
3 38
4 90
5 66
In [56]:
# certain numerical functions can also be used:
df[["age"]] * 2
Out[56]:
age
0 46
1 156
2 44
3 38
4 90
5 66
In [57]:
# also works for string values
df[["name"]].apply(lambda value: value.str.upper())
Out[57]:
name
0 JOHN
1 MARY
2 PETER
3 JEFF
4 BILL
5 LISA

Apply a function elementwise using map

use apply for DataFrame objects and map for Series objects

In [58]:
df['name'].map(lambda name: name.upper()).to_frame()
Out[58]:
name
0 JOHN
1 MARY
2 PETER
3 JEFF
4 BILL
5 LISA

Add new columns based on old ones

In [59]:
# simple sum of two columns
df["pets_and_children"] = df["num_pets"] + df["num_children"]
df
Out[59]:
name age state num_children num_pets pets_and_children
0 john 23 iowa 2 0 2
1 mary 78 dc 2 4 6
2 peter 22 california 0 0 0
3 jeff 19 texas 1 5 6
4 bill 45 washington 2 0 2
5 lisa 33 dc 1 0 1
In [60]:
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df

# you can also use custom functions we used on "elementwise application"
df["name_uppercase"] = df[["name"]].apply(lambda name: name.str.upper())
df
Out[60]:
name age state num_children num_pets name_uppercase
0 john 23 iowa 2 0 JOHN
1 mary 78 dc 2 4 MARY
2 peter 22 california 0 0 PETER
3 jeff 19 texas 1 5 JEFF
4 bill 45 washington 2 0 BILL
5 lisa 33 dc 1 0 LISA

Shuffle rows

In [82]:
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df

df.reindex(np.random.permutation(df.index))
Out[82]:
name age state num_children num_pets
4 bill 45 washington 2 0
3 jeff 19 texas 1 5
2 peter 22 california 0 0
1 mary 78 dc 2 4
5 lisa 33 dc 1 0
0 john 23 iowa 2 0

Iterate over all rows

In [83]:
for index,row in df.iterrows():
    print("{0} has name: {1}".format(index,row["name"]))
0 has name: john
1 has name: mary
2 has name: peter
3 has name: jeff
4 has name: bill
5 has name: lisa

Randomly sample rows

In [84]:
# sample 10 rows from df
random_indices = np.random.choice(df.index.values, 4, replace=False)

# iloc allows you to retrieve rows by their numeric indices
sampled_df = df.iloc[random_indices]
sampled_df
Out[84]:
name age state num_children num_pets
2 peter 22 california 0 0
3 jeff 19 texas 1 5
1 mary 78 dc 2 4
5 lisa 33 dc 1 0

Sort a dataframe

In [85]:
# sort by age, largest first
df.sort_values("age",ascending=False )
Out[85]:
name age state num_children num_pets
1 mary 78 dc 2 4
4 bill 45 washington 2 0
5 lisa 33 dc 1 0
0 john 23 iowa 2 0
2 peter 22 california 0 0
3 jeff 19 texas 1 5
In [86]:
# sort by num_pets descending then sort by age ascending
df.sort_values( ["num_pets","age"], ascending=[False,True] )
Out[86]:
name age state num_children num_pets
3 jeff 19 texas 1 5
1 mary 78 dc 2 4
2 peter 22 california 0 0
0 john 23 iowa 2 0
5 lisa 33 dc 1 0
4 bill 45 washington 2 0

custom sort

In [100]:
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,12,33],
    'state':['N/A','dc','california','texas','N/A','dc']
})

# sorting columns
df=df[['name','age','state']]
df
Out[100]:
name age state
0 john 23 N/A
1 mary 78 dc
2 peter 22 california
3 jeff 19 texas
4 bill 12 N/A
5 lisa 33 dc
In [104]:
def state_to_rank(state):
    if state=="N/A":
        return 1
    else:
        return 0
    
df['rank'] = df['state'].map(lambda x: state_to_rank(x))

df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True)
Out[104]:
name age state
0 jeff 19 texas
1 peter 22 california
2 lisa 33 dc
3 mary 78 dc
4 bill 12 N/A
5 john 23 N/A

Perform complex selections using lambdas

In [66]:
df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]
Out[66]:
name age state num_children num_pets
0 john 23 iowa 2 0
3 jeff 19 texas 1 5

Change column names

In [68]:
# use inplace=True if you want to mutate the current dataframe
df.rename(columns={"age":"age_years"} )
Out[68]:
name age_years state num_children num_pets
0 john 23 iowa 2 0
1 mary 78 dc 2 4
2 peter 22 california 0 0
3 jeff 19 texas 1 5
4 bill 45 washington 2 0
5 lisa 33 dc 1 0

Change column dtype

In [69]:
df['num_children'].dtype
Out[69]:
dtype('int64')
In [72]:
# we don't need 64 bits for num_children
df['num_children'] = df['num_children'].astype('int32')
df['num_children'].dtype

# it looks the same but takes less space
df
Out[72]:
name age state num_children num_pets
0 john 23 iowa 2 0
1 mary 78 dc 2 4
2 peter 22 california 0 0
3 jeff 19 texas 1 5
4 bill 45 washington 2 0
5 lisa 33 dc 1 0

Veryfing that the dataframe includes specific values

This is done using the .isin() method, which returns a boolean dataframe to indicate where the passed values are.

In [73]:
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]

# if the method is passed a simple list, it matches
# those values anywhere in the dataframe 
df.isin([2,4])
Out[73]:
name age state num_children num_pets
0 False False False True False
1 False False False True True
2 False False False False False
3 False False False False False
4 False False False True False
5 False False False False False
In [74]:
# you can also pass a dict or another dataframe
# as argument
df.isin({'num_pets':[4,5]})
Out[74]:
name age state num_children num_pets
0 False False False False False
1 False False False False True
2 False False False False False
3 False False False False True
4 False False False False False
5 False False False False False

Create an empty Dataframe and append rows one by one

In [75]:
# set column names and dtypes
new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})


# must reassign since the append method does not work in place
new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)
new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)

new_df
Out[75]:
col_a col_b
0 5.0 10
1 1.0 100

Create from list of dicts

In [76]:
new_df = pd.DataFrame(columns=['id','name'])

data_dict = [
    {'id':1,'name':"john"},
    {'id':2,'name':"mary"},
    {'id':3,'name':"peter"}
]

# must reassign since the append method does not work in place

new_df = new_df.from_records(data_dict)
new_df
Out[76]:
id name
0 1 john
1 2 mary
2 3 peter

converting types

In [77]:
df = pd.DataFrame({
    'name':['john','mary','peter'],
    "date_of_birth": ['27/05/2002','10/10/1999','01/04/1985']
})
df
Out[77]:
date_of_birth name
0 27/05/2002 john
1 10/10/1999 mary
2 01/04/1985 peter
In [79]:
df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')
df
Out[79]:
date_of_birth name
0 2002-05-27 john
1 1999-10-10 mary
2 1985-04-01 peter
In [80]:
df.dtypes
Out[80]:
date_of_birth    datetime64[ns]
name                     object
dtype: object
In [ ]: