import pandas as pd
pd.__version__
'0.22.0'
## `ix` has been deprecated

New in 0.20.0
# read the drinks dataset into a DataFrame, using the 'country' column as the index
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')
# preview the first rows
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
Albania | 89 | 132 | 54 | 4.9 | Europe |
Algeria | 25 | 0 | 14 | 0.7 | Africa |
Andorra | 245 | 138 | 312 | 12.4 | Europe |
Angola | 217 | 57 | 45 | 5.9 | Africa |
# loc accesses by label
drinks.loc['Angola', 'spirit_servings']
57
# iloc accesses by position
drinks.iloc[4, 1]
57
# ix accesses by label OR position (newly deprecated)
# here: row selected by label ('Angola'), column selected by position (1)
drinks.ix['Angola', 1]
/Users/kevin/miniconda3/envs/pd22.0/lib/python3.5/site-packages/ipykernel_launcher.py:2: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
57
# alternative: use loc (labels only), looking up the column label from its position
drinks.loc['Angola', drinks.columns[1]]
57
# alternative: use iloc (positions only), looking up the row position from its label
drinks.iloc[drinks.index.get_loc('Angola'), 1]
57
# ix accesses by label OR position (newly deprecated)
# here: row selected by position (4), column selected by label ('spirit_servings')
drinks.ix[4, 'spirit_servings']
57
# alternative: use loc (labels only), looking up the row label from its position
drinks.loc[drinks.index[4], 'spirit_servings']
57
# alternative: use iloc (positions only), looking up the column position from its label
drinks.iloc[4, drinks.columns.get_loc('spirit_servings')]
57
## Aliases have been added for `isnull` and `notnull`

New in 0.21.0
# read the UFO dataset into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# check which values are missing
ufo.isnull().head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | False | True | False | False | False |
1 | False | True | False | False | False |
2 | False | True | False | False | False |
3 | False | True | False | False | False |
4 | False | True | False | False | False |
# check which values are not missing
ufo.notnull().head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | True | False | True | True | True |
1 | True | False | True | True | True |
2 | True | False | True | True | True |
3 | True | False | True | True | True |
4 | True | False | True | True | True |
# drop rows with missing values
ufo.dropna().head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
12 | Belton | RED | SPHERE | SC | 6/30/1939 20:00 |
19 | Bering Sea | RED | OTHER | AK | 4/30/1943 23:00 |
36 | Portsmouth | RED | FORMATION | VA | 7/10/1945 1:30 |
44 | Blairsden | GREEN | SPHERE | CA | 6/30/1946 19:00 |
82 | San Jose | BLUE | CHEVRON | CA | 7/15/1947 21:00 |
# fill in missing values
ufo.fillna(value='UNKNOWN').head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | UNKNOWN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | UNKNOWN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | UNKNOWN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | UNKNOWN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | UNKNOWN | LIGHT | NY | 4/18/1933 19:00 |
# new alias for isnull
ufo.isna().head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | False | True | False | False | False |
1 | False | True | False | False | False |
2 | False | True | False | False | False |
3 | False | True | False | False | False |
4 | False | True | False | False | False |
# new alias for notnull
ufo.notna().head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | True | False | True | True | True |
1 | True | False | True | True | True |
2 | True | False | True | True | True |
3 | True | False | True | True | True |
4 | True | False | True | True | True |
## `drop` now accepts "index" and "columns" keywords

New in 0.21.0
# read the UFO dataset into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# old way to drop rows: specify labels and axis
# (the two calls differ only in how the row axis is spelled: 0 or 'index')
ufo.drop([0, 1], axis=0).head()
ufo.drop([0, 1], axis='index').head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
5 | Valley City | NaN | DISK | ND | 9/15/1934 15:30 |
6 | Crater Lake | NaN | CIRCLE | CA | 6/15/1935 0:00 |
# new way to drop rows: specify index
ufo.drop(index=[0, 1]).head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
5 | Valley City | NaN | DISK | ND | 9/15/1934 15:30 |
6 | Crater Lake | NaN | CIRCLE | CA | 6/15/1935 0:00 |
# old way to drop columns: specify labels and axis
# (the two calls differ only in how the column axis is spelled: 1 or 'columns')
ufo.drop(['City', 'State'], axis=1).head()
ufo.drop(['City', 'State'], axis='columns').head()
 | Colors Reported | Shape Reported | Time |
---|---|---|---|
0 | NaN | TRIANGLE | 6/1/1930 22:00 |
1 | NaN | OTHER | 6/30/1930 20:00 |
2 | NaN | OVAL | 2/15/1931 14:00 |
3 | NaN | DISK | 6/1/1931 13:00 |
4 | NaN | LIGHT | 4/18/1933 19:00 |
# new way to drop columns: specify columns
ufo.drop(columns=['City', 'State']).head()
 | Colors Reported | Shape Reported | Time |
---|---|---|---|
0 | NaN | TRIANGLE | 6/1/1930 22:00 |
1 | NaN | OTHER | 6/30/1930 20:00 |
2 | NaN | OVAL | 2/15/1931 14:00 |
3 | NaN | DISK | 6/1/1931 13:00 |
4 | NaN | LIGHT | 4/18/1933 19:00 |
## `rename` and `reindex` now accept "axis" keyword

New in 0.21.0
# old way to rename columns: specify columns
ufo.rename(columns={'City':'CITY', 'State':'STATE'}).head()
 | CITY | Colors Reported | Shape Reported | STATE | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# new way to rename columns: specify mapper and axis
ufo.rename({'City':'CITY', 'State':'STATE'}, axis='columns').head()
 | CITY | Colors Reported | Shape Reported | STATE | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# note: mapper can be a function
# (here str.upper is applied to every column label)
ufo.rename(str.upper, axis='columns').head()
 | CITY | COLORS REPORTED | SHAPE REPORTED | STATE | TIME |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
## Ordered categories must be specified with a `CategoricalDtype`

New in 0.21.0
# build a small example DataFrame: four IDs, each with a quality rating
df = pd.DataFrame(dict(
    ID=[100, 101, 102, 103],
    quality=['good', 'very good', 'good', 'excellent'],
))
df
 | ID | quality |
---|---|---|
0 | 100 | good |
1 | 101 | very good |
2 | 102 | good |
3 | 103 | excellent |
# old way to create an ordered category (deprecated)
df.quality.astype('category', categories=['good', 'very good', 'excellent'], ordered=True)
/Users/kevin/miniconda3/envs/pd22.0/lib/python3.5/site-packages/ipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]
# new way to create an ordered category: describe the categories and their
# order in a CategoricalDtype, then cast the column to that dtype
from pandas.api.types import CategoricalDtype
quality_cat = CategoricalDtype(categories=['good', 'very good', 'excellent'],
                               ordered=True)
df['quality'] = df['quality'].astype(quality_cat)
df['quality']
0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]