import pandas as pd
import os
import matplotlib
%matplotlib inline
print pd.__version__
print matplotlib.__version__
0.20.3 2.2.2
# reading table
# making seperator as comma
df = pd.read_table(
'http://bit.ly/uforeports',
sep=','
)
df.head(5)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# isnull() method returns a df of T/F considering if NaN occurs in a particular cell
df.isnull().head()
# notnull() is just inverse of isnull()
# df.notnull().head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | False | True | False | False | False |
1 | False | True | False | False | False |
2 | False | True | False | False | False |
3 | False | True | False | False | False |
4 | False | True | False | False | False |
# pandas converts T to 1 and F to 0
# since isnull() returns T to whereever NaN exists
# .sum() over it gives the total number of NaN in the dataframe
df.isnull().sum()
City 25 Colors Reported 15359 Shape Reported 2644 State 0 Time 0 dtype: int64
# Handling Missing Values
## Drop Missing Values
# dropna() method drops al those rows from the dataset where anyof the columns holds a NaN values
df.dropna(how='any')
# (how='all') - drop only if all the values in the row are missing
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
12 | Belton | RED | SPHERE | SC | 6/30/1939 20:00 |
19 | Bering Sea | RED | OTHER | AK | 4/30/1943 23:00 |
36 | Portsmouth | RED | FORMATION | VA | 7/10/1945 1:30 |
44 | Blairsden | GREEN | SPHERE | CA | 6/30/1946 19:00 |
82 | San Jose | BLUE | CHEVRON | CA | 7/15/1947 21:00 |
84 | Modesto | BLUE | DISK | CA | 8/8/1947 22:00 |
91 | Scipio | RED | SPHERE | IN | 5/10/1948 19:00 |
111 | Tarrant City | ORANGE | CIRCLE | AL | 8/15/1949 22:00 |
129 | Napa | GREEN | DISK | CA | 6/10/1950 0:00 |
138 | Coeur d'Alene | ORANGE | CIGAR | ID | 7/2/1950 13:00 |
152 | Irving | BLUE | DISK | KS | 4/15/1951 0:30 |
157 | Greenville | GREEN | DISK | MS | 6/15/1951 20:30 |
163 | Green River | GREEN | SPHERE | WY | 7/3/1951 12:00 |
164 | Provo | BLUE | DISK | UT | 7/10/1951 23:30 |
174 | Greenville | ORANGE | TRIANGLE | TX | 4/15/1952 16:00 |
178 | Norfolk | RED | FIREBALL | VA | 6/1/1952 22:00 |
202 | Arlington | GREEN | OVAL | VA | 7/13/1952 21:00 |
226 | Cambridge | RED | SPHERE | MA | 4/1/1953 18:00 |
229 | Midwest City | YELLOW | FIREBALL | OK | 4/15/1953 16:00 |
238 | Cleveland | RED | FIREBALL | OH | 6/30/1953 0:00 |
249 | Artesia | ORANGE | OTHER | NM | 8/15/1953 19:00 |
256 | Pendleton | GREEN | DISK | IN | 11/21/1953 22:30 |
288 | St. Louis Airport | RED | OVAL | MO | 7/1/1954 21:00 |
289 | Los Angeles | RED | CIRCLE | CA | 7/1/1954 22:00 |
304 | Beaumont | RED | DISK | TX | 9/9/1954 12:30 |
311 | Red Bank | ORANGE | CIRCLE | NJ | 12/15/1954 23:10 |
314 | Holbrook | YELLOW | EGG | MA | 5/1/1955 15:00 |
323 | Terre Haute | ORANGE | CIRCLE | IN | 6/15/1955 0:00 |
354 | Memphis | ORANGE | CYLINDER | TN | 6/1/1956 20:00 |
363 | Vista | ORANGE | CIGAR | CA | 6/15/1956 19:15 |
... | ... | ... | ... | ... | ... |
18110 | Gilbert | RED | LIGHT | AZ | 12/11/2000 23:45 |
18115 | Madison | RED GREEN | TEARDROP | OH | 12/12/2000 18:45 |
18117 | Concord | RED GREEN | CIRCLE | NH | 12/12/2000 23:10 |
18121 | Redwood Valley | BLUE | LIGHT | CA | 12/15/2000 1:09 |
18123 | Waldorf | GREEN | LIGHT | MD | 12/15/2000 5:30 |
18127 | North Pole | ORANGE | SPHERE | AK | 12/15/2000 16:36 |
18129 | Beaver | RED | DISK | PA | 12/15/2000 18:25 |
18134 | Graham | GREEN | LIGHT | WA | 12/16/2000 0:20 |
18141 | Sebastopol | YELLOW | CIRCLE | CA | 12/17/2000 18:00 |
18144 | Medina | ORANGE | TRIANGLE | OH | 12/17/2000 19:30 |
18148 | Highland Park | BLUE | VARIOUS | NJ | 12/18/2000 2:30 |
18158 | Woodland | ORANGE | LIGHT | CA | 12/19/2000 23:30 |
18167 | Gilbert | RED | CIRCLE | AZ | 12/21/2000 19:05 |
18170 | MM 110 | BLUE | CIRCLE | AZ | 12/22/2000 3:00 |
18172 | Toomsuba | RED | OVAL | MS | 12/23/2000 4:00 |
18177 | Chandler | RED | LIGHT | AZ | 12/23/2000 22:00 |
18181 | Fortuna | ORANGE | CIRCLE | CA | 12/24/2000 18:00 |
18184 | Plymouth | GREEN | FIREBALL | OH | 12/24/2000 22:00 |
18191 | Fallston | RED | VARIOUS | MD | 12/25/2000 19:15 |
18192 | Atlanta | ORANGE | LIGHT | GA | 12/25/2000 20:30 |
18194 | Walpole | GREEN | FIREBALL | NH | 12/26/2000 18:20 |
18195 | Walpole | GREEN | FIREBALL | NH | 12/26/2000 18:20 |
18196 | Brocton | GREEN | OVAL | MA | 12/26/2000 18:23 |
18197 | Greenfiled | GREEN | FIREBALL | MA | 12/26/2000 18:30 |
18210 | Monument Valley | BLUE | OTHER | UT | 12/28/2000 17:51 |
18213 | Pasadena | GREEN | FIREBALL | CA | 12/28/2000 19:10 |
18216 | Garden Grove | ORANGE | LIGHT | CA | 12/29/2000 16:10 |
18220 | Shasta Lake | BLUE | DISK | CA | 12/29/2000 20:30 |
18233 | Anchorage | RED | VARIOUS | AK | 12/31/2000 21:00 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
2486 rows × 5 columns
# drop a row only if any of the City, Colors Reported columns have NaN value
df.dropna(subset=['City', 'Colors Reported'], how='any').shape
(2877, 5)
print df.City.value_counts().head(2)
print
print df.City.isnull().sum()
print
print df.City.fillna(value='Seattle', inplace=False).value_counts().head(2)
print "\n\n187+25 = 212"
Seattle 187 New York City 161 Name: City, dtype: int64 25 Seattle 212 New York City 161 Name: City, dtype: int64 187+25 = 212