#!/usr/bin/env python
# coding: utf-8

# #### An example of handling missing values in data. We will use a modified
# version of the video store data
# (__[Video_Store_3.csv](http://facweb.cs.depaul.edu/mobasher/classes/CSC478/data/Video_Store_3.csv)__)
# which contains some missing values.

# In[1]:

import numpy as np
import pandas as pd

# Location of the data set; it is read twice below (once per clean-up strategy).
DATA_URL = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store_3.csv"

# In[2]:

# The first column is used as the row index, and "?" entries are parsed as NaN
# so that pandas' missing-value tools can see them.
vstable = pd.read_csv(DATA_URL, index_col=0, na_values=["?"])
vstable.shape

# In[3]:

vstable.head(10)

# In[4]:

vstable.info()

# In[5]:

vstable.describe(include="all")

# In[6]:

# Boolean mask of missing cells for the first ten rows.
vstable.isnull()[0:10]

# In[7]:

# Rows that have at least one missing value in any column.
vstable[vstable.isnull().any(axis=1)]

# In[8]:

vstable[vstable.Gender.isnull()]

# In[9]:

vstable[vstable.Age.isnull()]

# In[10]:

# Replace missing ages with the mean of the observed ages.
# The filled column is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on `vstable.Age`: that form operates on an
# intermediate Series, which pandas warns about and which no longer updates
# `vstable` when Copy-on-Write is enabled.
age_mean = vstable["Age"].mean()
vstable["Age"] = vstable["Age"].fillna(age_mean)

# In[11]:

vstable.head(10)

# In[12]:

# Without inplace=True, drop() returns a new DataFrame; `vstable` itself is
# left untouched, as the next cell shows.
vstable.drop(vstable[vstable.Gender.isnull()].index, axis=0).head(10)

# In[13]:

vstable.head(10)

# #### To permanently remove the rows with NaN Gender values, use the "inplace"
# parameter in the "drop" function.

# In[14]:

vstable.drop(vstable[vstable.Gender.isnull()].index, axis=0, inplace=True)
vstable.head(10)

# #### It is also possible to use the "dropna" function to drop all rows that
# have one or more NaN values.

# In[15]:

# Re-read the data so this example starts from the unmodified table.
vstable2 = pd.read_csv(DATA_URL, index_col=0, na_values=["?"])

# In[16]:

vstable2.head(10)

# In[17]:

vstable2.dropna(axis=0, inplace=True)
vstable2.shape

# In[18]:

vstable2.head(10)

# In[ ]: