import pandas as pd
import numpy as np
import builtins
def print(*args, **kwargs):
    """Shadow the built-in print to insert a blank line after every call.

    Deliberate notebook-style spacing: delegates to builtins.print with
    end='\n\n' unless the caller supplies an explicit end.
    """
    # setdefault instead of a hard-coded keyword: the original
    # builtins.print(*args, **kwargs, end='\n\n') raised TypeError
    # (duplicate keyword) whenever a caller passed end= itself.
    kwargs.setdefault('end', '\n\n')
    builtins.print(*args, **kwargs)
A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index.
# Build a Series from a list; pandas assigns a default RangeIndex (0..3).
obj = pd.Series([4, 7, -5, 3])
print(obj)
print(obj.index)   # RangeIndex(start=0, stop=4, step=1)
print(obj.values)  # the underlying NumPy array
0 4 1 7 2 -5 3 3 dtype: int64 RangeIndex(start=0, stop=4, step=1) [ 4 7 -5 3]
# A Series with explicit string labels instead of the default RangeIndex.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'a', 'c', 'b'])
print(obj2)
print(obj2.index)  # Index of dtype object
d 4 a 7 c -5 b 3 dtype: int64 Index(['d', 'a', 'c', 'b'], dtype='object')
# Scalar lookup by index label
obj2['a']
7
# Selecting several labels at once returns a sub-Series in the given order
obj2[['c', 'a', 'd']]
c -5 a 7 d 4 dtype: int64
# A dict converts directly to a Series: keys become the index.
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
print(obj3)
Ohio 35000 Texas 71000 Oregon 16000 Utah 5000 dtype: int64
# Passing an explicit index selects/reorders keys; labels missing from
# sdata ('California') become NaN, and 'Utah' is dropped.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
print(obj4)
California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 dtype: float64
# Boolean masks for missing data (isnull/notnull are aliases of isna/notna)
print(obj4.isnull())
print(obj4.notnull())
California True Ohio False Oregon False Texas False dtype: bool California False Ohio True Oregon True Texas True dtype: bool
print(obj3)
print(obj4)
# Arithmetic aligns on the index union; labels present in only one
# operand yield NaN (California, Utah).
print(obj3 + obj4)
Ohio 35000 Texas 71000 Oregon 16000 Utah 5000 dtype: int64 California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 dtype: float64 California NaN Ohio 70000.0 Oregon 32000.0 Texas 142000.0 Utah NaN dtype: float64
# Both the Series and its index have a settable name attribute,
# shown in the printed representation.
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)
state California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 Name: population, dtype: float64
print(obj)
# The index can be replaced wholesale by assignment (length must match).
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)
0 4 1 7 2 -5 3 3 dtype: int64 Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64
A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.).
# A dict of equal-length lists becomes a DataFrame; keys become columns.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
print(frame)
state year pop 0 Ohio 2000 1.5 1 Ohio 2001 1.7 2 Ohio 2002 3.6 3 Nevada 2001 2.4 4 Nevada 2002 2.9 5 Nevada 2003 3.2
# columns= fixes the column order
pd.DataFrame(data, columns=['year', 'state', 'pop'])
year | state | pop | |
---|---|---|---|
0 | 2000 | Ohio | 1.5 |
1 | 2001 | Ohio | 1.7 |
2 | 2002 | Ohio | 3.6 |
3 | 2001 | Nevada | 2.4 |
4 | 2002 | Nevada | 2.9 |
5 | 2003 | Nevada | 3.2 |
# If you pass a column that isn’t contained in the dict,
# it will appear with missing values (the 'debt' column below).
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four', 'five', 'six'])
print(frame2)
print(frame2.columns)
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 NaN three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN six 2003 Nevada 3.2 NaN Index(['year', 'state', 'pop', 'debt'], dtype='object')
# Two equivalent column accessors: dict-style and attribute-style
# (attribute access only works for valid-identifier column names).
print(frame['state'])
print(frame.year)
0 Ohio 1 Ohio 2 Ohio 3 Nevada 4 Nevada 5 Nevada Name: state, dtype: object 0 2000 1 2001 2 2002 3 2001 4 2002 5 2003 Name: year, dtype: int64
# Rows can also be retrieved by
# position or name with the special loc attribute
frame2.loc['three']
year 2002 state Ohio pop 3.6 debt NaN Name: three, dtype: object
print(frame2)
# Assigning a scalar broadcasts it to every row of the column.
frame2['debt'] = 16.5
print(frame2)
# Assigning an array requires a matching length (6 here).
frame2['debt'] = np.arange(6.)
print(frame2)
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 NaN three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN six 2003 Nevada 3.2 NaN year state pop debt one 2000 Ohio 1.5 16.5 two 2001 Ohio 1.7 16.5 three 2002 Ohio 3.6 16.5 four 2001 Nevada 2.4 16.5 five 2002 Nevada 2.9 16.5 six 2003 Nevada 3.2 16.5 year state pop debt one 2000 Ohio 1.5 0.0 two 2001 Ohio 1.7 1.0 three 2002 Ohio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0 six 2003 Nevada 3.2 5.0
# Assigning a Series aligns on the frame's index: unmatched
# rows ('one', 'three', 'six') get NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 -1.2 three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 -1.5 five 2002 Nevada 2.9 -1.7 six 2003 Nevada 3.2 NaN
# Assigning a column that doesn’t exist will create a new column.
# The del keyword will delete columns as with a dict.
# As an example of del, I first add a new column of boolean
# values where the state column equals 'Ohio':
frame2['eastern'] = frame2['state'] == 'Ohio'
print(frame2)
del frame2['eastern']
frame2.columns  # 'eastern' is gone again
year state pop debt eastern one 2000 Ohio 1.5 NaN True two 2001 Ohio 1.7 -1.2 True three 2002 Ohio 3.6 NaN True four 2001 Nevada 2.4 -1.5 False five 2002 Nevada 2.9 -1.7 False six 2003 Nevada 3.2 NaN False
Index(['year', 'state', 'pop', 'debt'], dtype='object')
# another way is a nested dict of dicts
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
# If the nested dict is passed to the DataFrame,
# pandas will interpret the outer dict keys
# as the columns and the inner keys as the row indices
frame3 = pd.DataFrame(pop)
print(frame3)
Nevada Ohio 2001 2.4 1.7 2002 2.9 3.6 2000 NaN 1.5
# transpose dataframe (swap rows and columns, like a NumPy array's .T)
frame3.T
2001 | 2002 | 2000 | |
---|---|---|---|
Nevada | 2.4 | 2.9 | NaN |
Ohio | 1.7 | 3.6 | 1.5 |
# Both axes of a DataFrame have a name attribute shown when printed.
frame3.index.name = 'year'; frame3.columns.name = 'state'
print(frame3)
state Nevada Ohio year 2001 2.4 1.7 2002 2.9 3.6 2000 NaN 1.5
axis=0 => apply the operation down the rows (i.e., once per column, along the index)
axis=1 => apply the operation across the columns (i.e., once per row)
# Load a sample dataset over HTTP (requires network access; the
# short link redirects to a CSV of per-country alcohol servings).
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# dropping a column (returns a new frame; drinks is unchanged)
drinks.drop('continent', axis=1).head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | |
---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 |
1 | Albania | 89 | 132 | 54 | 4.9 |
2 | Algeria | 25 | 0 | 14 | 0.7 |
3 | Andorra | 245 | 138 | 312 | 12.4 |
4 | Angola | 217 | 57 | 45 | 5.9 |
# dropping a row by index label (2); also returns a new frame
drinks.drop(2, axis=0).head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
5 | Antigua & Barbuda | 102 | 128 | 45 | 4.9 | North America |
# default axis=0
# mean of each column
# numeric_only=True: 'country' and 'continent' hold strings, and modern
# pandas (>=2.0) raises TypeError from DataFrame.mean() instead of
# silently dropping non-numeric columns as older versions did.
print(drinks.mean(numeric_only=True).shape)
drinks.mean(numeric_only=True)
(4,)
beer_servings 106.160622 spirit_servings 80.994819 wine_servings 49.450777 total_litres_of_pure_alcohol 4.717098 dtype: float64
# mean of each row
# numeric_only=True excludes the string columns; modern pandas (>=2.0)
# raises TypeError otherwise instead of dropping them silently.
print(drinks.mean(axis=1, numeric_only=True).shape)
drinks.mean(axis=1, numeric_only=True)
(193,)
0 0.000 1 69.975 2 9.925 3 176.850 4 81.225 ... 188 110.925 189 29.000 190 1.525 191 14.375 192 22.675 Length: 193, dtype: float64
An important method on pandas objects is reindex, which means to create a new object with the data conformed to a new index
# reindex returns a NEW Series conformed to the given labels,
# reordering existing values; obj itself is unchanged.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
obj2 = obj.reindex(['a', 'b', 'c', 'd'])
print(obj2)
d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64 a -5.3 b 7.2 c 3.6 d 4.5 dtype: float64
For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The method option allows us to do this, using a method such as ffill, which forward-fills the values
# method='ffill' forward-fills gaps introduced by the new index
# (labels 1, 3, 5 take the value of the preceding existing label).
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3.reindex(range(6), method='ffill')
0 blue 2 purple 4 yellow dtype: object
0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object
# 3x3 frame used for the reindex examples below
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
columns=['Ohio', 'Texas', 'California'])
frame.index.name = 'letters'
frame
Ohio | Texas | California | |
---|---|---|---|
letters | |||
a | 0 | 1 | 2 |
c | 3 | 4 | 5 |
d | 6 | 7 | 8 |
# Row reindex: the new label 'b' gets an all-NaN row, which also
# upcasts the int columns to float.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
Ohio | Texas | California | |
---|---|---|---|
letters | |||
a | 0.0 | 1.0 | 2.0 |
b | NaN | NaN | NaN |
c | 3.0 | 4.0 | 5.0 |
d | 6.0 | 7.0 | 8.0 |
# The columns can be reindexed with the columns keyword:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
Texas | Utah | California | |
---|---|---|---|
letters | |||
a | 1 | NaN | 2 |
c | 4 | NaN | 5 |
d | 7 | NaN | 8 |
# Reindex rows and columns in one call
frame.reindex(['a', 'b', 'c', 'd'], columns=states)
Texas | Utah | California | |
---|---|---|---|
letters | |||
a | 1.0 | NaN | 2.0 |
b | NaN | NaN | NaN |
c | 4.0 | NaN | 5.0 |
d | 7.0 | NaN | 8.0 |
# fill_value replaces the NaNs introduced by reindexing with 0
frame.reindex(['a', 'b', 'c', 'd'], columns=states, fill_value=0)
Texas | Utah | California | |
---|---|---|---|
letters | |||
a | 1 | 0 | 2 |
b | 0 | 0 | 0 |
c | 4 | 0 | 5 |
d | 7 | 0 | 8 |
drop method will return a new object with the indicated value or values deleted from an axis
# Float Series used for the drop examples below
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64
# drop returns a new Series without the given label; obj is unchanged
new_obj = obj.drop('c')
new_obj
a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64
# A list drops several labels at once
obj.drop(['d', 'c'])
a 0.0 b 1.0 e 4.0 dtype: float64
With DataFrame, index values can be deleted from either axis
# 4x4 frame used for the DataFrame drop examples
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# Calling drop with a sequence of labels will
# drop values from the row labels (axis 0):
data.drop(['Colorado', 'Ohio'])
one | two | three | four | |
---|---|---|---|---|
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# drop values from columns: pass axis=1 or axis='columns'
data.drop(['two', 'four'], axis=1)
one | three | |
---|---|---|
Ohio | 0 | 2 |
Colorado | 4 | 6 |
Utah | 8 | 10 |
New York | 12 | 14 |
print(obj)
# inplace=True mutates obj itself and returns None
obj.drop('c', inplace=True)
print(obj)
a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64 a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64
Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers.
# Series used for the indexing examples below
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj
a 0.0 b 1.0 c 2.0 d 3.0 dtype: float64
print(obj['b'])              # label-based lookup
# Positional access through plain [] (obj[1], obj[2:4], obj[[1, 3]])
# relied on integer fallback for label-indexed Series; that fallback is
# deprecated since pandas 2.1 and removed in 3.0 -- use .iloc explicitly.
print(obj.iloc[1])           # lookup by position
print(obj.iloc[2:4])         # positional slice (endpoint exclusive)
print(obj[['b', 'a', 'd']])  # list of labels
print(obj.iloc[[1, 3]])      # list of positions
1.0 1.0 c 2.0 d 3.0 dtype: float64 b 1.0 a 0.0 d 3.0 dtype: float64 b 1.0 d 3.0 dtype: float64
# Boolean-mask filtering keeps only entries where the condition holds
print(obj[obj < 2])
a 0.0 b 1.0 dtype: float64
Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive.
# Label slice: unlike positional slices, BOTH endpoints are included
obj['b': 'c']
b 1.0 c 2.0 dtype: float64
Indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence
# 4x4 frame used for the DataFrame indexing examples
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# A single name selects one column (Series); a list selects a sub-frame
print(data['two'])
print(data[['three', 'four']])
Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int32 three four Ohio 2 3 Colorado 6 7 Utah 10 11 New York 14 15
The row selection syntax data[:2] is provided as a convenience. Passing a single element or a list to the [ ] operator selects columns.
# A slice in [] selects ROWS (convenience), not columns
data[:2]
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
# A boolean Series in [] filters rows
data[data['three'] > 5]
one | two | three | four | |
---|---|---|---|---|
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# Element-wise comparison yields a boolean frame of the same shape
data < 5
one | two | three | four | |
---|---|---|---|---|
Ohio | True | True | True | True |
Colorado | True | False | False | False |
Utah | False | False | False | False |
New York | False | False | False | False |
Setting using these methods modifies the corresponding section of the Series
# Assigning through a label slice modifies obj in place ('b' and 'c')
obj['b': 'c'] = 5
obj
a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64
# Rebuild the 4x4 example frame for the assignment demo below
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# Boolean-mask assignment zeroes every element below 5, in place
data[data < 5] = 0
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc. They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation using either axis labels (loc) or integers (iloc).
# select a single row and multiple columns by label
# loc[Row, col]
data.loc['Colorado', ['two', 'three']]
two 5 three 6 Name: Colorado, dtype: int32
# with integers using iloc (row 1 = 'Colorado', columns 1 and 2)
data.iloc[1, [1, 2]]
two 5 three 6 Name: Colorado, dtype: int32
# A single integer selects one row; lists pick rows/columns in the order given
print(data.iloc[2])
print(data.iloc[[1, 2], [3, 0, 1]])
one 8 two 9 three 10 four 11 Name: Utah, dtype: int32 four one two Colorado 7 0 5 Utah 11 8 9
print(data)
# loc slices by label and INCLUDES the endpoint ('Utah')
print(data.loc[:'Utah', 'two'])
one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15 Ohio 0 Colorado 5 Utah 9 Name: two, dtype: int32
# First three columns, then a boolean row filter (chained indexing
# is fine for reading; avoid it for assignment)
data.iloc[:, :3][data.three > 5]
one | two | three | |
---|---|---|---|
Colorado | 0 | 5 | 6 |
Utah | 8 | 9 | 10 |
New York | 12 | 13 | 14 |
An important pandas feature for some applications is the behavior of arithmetic between objects with different indexes. When you are adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs. For users with database experience, this is similar to an automatic outer join on the index labels.
# Two Series with partially overlapping indexes for the alignment demo
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)
a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64 a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64
# Result index is the union; labels in only one operand become NaN
s1 + s2
a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
In the case of DataFrame, alignment is performed on both the rows and the columns
# Frames with partially overlapping rows AND columns
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
b c d Ohio 0.0 1.0 2.0 Texas 3.0 4.0 5.0 Colorado 6.0 7.0 8.0 b d e Utah 0.0 1.0 2.0 Ohio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
# Alignment on both axes: only (Ohio/Texas) x (b/d) overlap; rest is NaN
df1 + df2
b | c | d | e | |
---|---|---|---|---|
Colorado | NaN | NaN | NaN | NaN |
Ohio | 3.0 | NaN | 6.0 | NaN |
Oregon | NaN | NaN | NaN | NaN |
Texas | 9.0 | NaN | 12.0 | NaN |
Utah | NaN | NaN | NaN | NaN |
# Frames with NO common columns
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
print(df1)
print(df2)
A 0 1 1 2 B 0 3 1 4
# With no overlapping columns every result cell is NaN
df1 - df2
A | B | |
---|---|---|
0 | NaN | NaN |
1 | NaN | NaN |
In arithmetic operations between differently indexed objects, you might want to fill with a special value, like 0, when an axis label is found in one object but not the other
# 3x4 and 4x5 frames; one cell of df2 is set to NaN deliberately
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
print(df1)
print(df2)
a b c d 0 0.0 1.0 2.0 3.0 1 4.0 5.0 6.0 7.0 2 8.0 9.0 10.0 11.0 a b c d e 0 0.0 1.0 2.0 3.0 4.0 1 5.0 NaN 7.0 8.0 9.0 2 10.0 11.0 12.0 13.0 14.0 3 15.0 16.0 17.0 18.0 19.0
# Adding these together results in NA values in the locations that don’t overlap
df1 + df2
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 2.0 | 4.0 | 6.0 | NaN |
1 | 9.0 | NaN | 13.0 | 15.0 | NaN |
2 | 18.0 | 20.0 | 22.0 | 24.0 | NaN |
3 | NaN | NaN | NaN | NaN | NaN |
# Using the add method on df1, you pass df2 and an argument to fill_value;
# missing positions are treated as 0 BEFORE adding (the pre-existing NaN
# at df2.loc[1, 'b'] is therefore filled, not propagated).
df1.add(df2, fill_value=0)
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 2.0 | 4.0 | 6.0 | 4.0 |
1 | 9.0 | 5.0 | 13.0 | 15.0 | 9.0 |
2 | 18.0 | 20.0 | 22.0 | 24.0 | 14.0 |
3 | 15.0 | 16.0 | 17.0 | 18.0 | 19.0 |
# 1 / df1
# or
# rdiv is the reflected division: element-wise 1 / df1 (0 -> inf)
df1.rdiv(1)
a | b | c | d | |
---|---|---|---|---|
0 | inf | 1.000000 | 0.500000 | 0.333333 |
1 | 0.250 | 0.200000 | 0.166667 | 0.142857 |
2 | 0.125 | 0.111111 | 0.100000 | 0.090909 |
# operations between DataFrame and Series
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=("utah", "ohio", "texas", "oregon"))
series = frame.iloc[0]  # first row, so its index matches frame's columns
print(frame)
print(series)
b d e utah 0.0 1.0 2.0 ohio 3.0 4.0 5.0 texas 6.0 7.0 8.0 oregon 9.0 10.0 11.0 b 0.0 d 1.0 e 2.0 Name: utah, dtype: float64
By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame’s columns, broadcasting down the rows
# Series index matches frame columns, so subtraction broadcasts row-wise
frame - series
b | d | e | |
---|---|---|---|
utah | 0.0 | 0.0 | 0.0 |
ohio | 3.0 | 3.0 | 3.0 |
texas | 6.0 | 6.0 | 6.0 |
oregon | 9.0 | 9.0 | 9.0 |
If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union
# 'f' is not a frame column and 'd' not in series2, so both become NaN
series2 = pd.Series(range(3), index=list('bef'))
frame + series2
b | d | e | f | |
---|---|---|---|---|
utah | 0.0 | NaN | 3.0 | NaN |
ohio | 3.0 | NaN | 6.0 | NaN |
texas | 6.0 | NaN | 9.0 | NaN |
oregon | 9.0 | NaN | 12.0 | NaN |
# broadcasting over columns
series3 = frame['d']  # its index matches frame's ROW index
series3
utah 1.0 ohio 4.0 texas 7.0 oregon 10.0 Name: d, dtype: float64
# axis='index' matches on the row labels and broadcasts across columns
frame.sub(series3, axis='index')
b | d | e | |
---|---|---|---|
utah | -1.0 | 0.0 | 1.0 |
ohio | -1.0 | 0.0 | 1.0 |
texas | -1.0 | 0.0 | 1.0 |
oregon | -1.0 | 0.0 | 1.0 |
The axis number that you pass is the axis to match on. In this case we mean to match on the DataFrame’s row index (axis='index' or axis=0) and broadcast across.
# Random normal frame (values below differ on every run; outputs shown
# in this transcript are from one particular run)
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
b | d | e | |
---|---|---|---|
Utah | -1.846691 | -0.622582 | -1.542595 |
Ohio | 0.818697 | 0.730596 | 0.458859 |
Texas | 0.612735 | 0.593502 | -0.631133 |
Oregon | -0.067950 | -1.023214 | 0.115138 |
# NumPy ufuncs apply element-wise to a DataFrame
np.abs(frame)
b | d | e | |
---|---|---|---|
Utah | 1.846691 | 0.622582 | 1.542595 |
Ohio | 0.818697 | 0.730596 | 0.458859 |
Texas | 0.612735 | 0.593502 | 0.631133 |
Oregon | 0.067950 | 1.023214 | 0.115138 |
# Another frequent operation is applying a function on
# one-dimensional arrays to each column or row.
# DataFrame’s apply method does exactly this.
def f(x):
    """Return the spread (max - min) of a Series."""
    # def instead of a named lambda (PEP 8 E731); behavior is identical
    return x.max() - x.min()
frame.apply(f)
b 2.665389 d 1.753811 e 2.001455 dtype: float64
Here the function f, which computes the difference between the maximum and minimum of a Series, is invoked once on each column in frame. The result is a Series having the columns of frame as its index.
# If you pass axis='columns' to apply,
# the function will be invoked once per row
frame.apply(f, axis='columns')
Utah 1.224110 Ohio 0.359838 Texas 1.243868 Oregon 1.138353 dtype: float64
# returning a Series with multiple values
def f(x):
    """Return the min and max of a Series as a two-element labeled Series."""
    # body re-indented: the return statement was at column 0 in the
    # transcript, which is a SyntaxError
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
b | d | e | |
---|---|---|---|
min | -1.846691 | -1.023214 | -1.542595 |
max | 0.818697 | 0.730596 | 0.458859 |
Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating-point value in frame. You can do this with applymap
# NOTE: 'format' shadows the builtin format(); kept to match the later map() example
format = lambda x: f'{x:.2f}'
print(frame)
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1
# (renamed to DataFrame.map); applies the function element-wise
frame.applymap(format)
b d e Utah -1.846691 -0.622582 -1.542595 Ohio 0.818697 0.730596 0.458859 Texas 0.612735 0.593502 -0.631133 Oregon -0.067950 -1.023214 0.115138
b | d | e | |
---|---|---|---|
Utah | -1.85 | -0.62 | -1.54 |
Ohio | 0.82 | 0.73 | 0.46 |
Texas | 0.61 | 0.59 | -0.63 |
Oregon | -0.07 | -1.02 | 0.12 |
The reason for the name applymap is that Series has a map method for applying an element-wise function
# Series.map applies an element-wise function to one column
frame['e'].map(format)
Utah -1.54 Ohio 0.46 Texas -0.63 Oregon 0.12 Name: e, dtype: object
Sorting a dataset by some criterion is another important built-in operation. To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object
# Series with unsorted labels for the sort_index demo
obj = pd.Series(range(4), index=['b', 'd', 'a', 'c'])
obj
b 0 d 1 a 2 c 3 dtype: int64
# Returns a new Series sorted by index label
obj.sort_index()
a 2 b 0 c 3 d 1 dtype: int64
# Frame with unsorted row and column labels; sort_index() sorts rows
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
columns=['d', 'a', 'b', 'c'])
frame.sort_index()
d | a | b | c | |
---|---|---|---|---|
one | 4 | 5 | 6 | 7 |
three | 0 | 1 | 2 | 3 |
# axis=1 sorts the column labels instead of the rows
frame.sort_index(axis=1)
a | b | c | d | |
---|---|---|---|---|
three | 1 | 2 | 3 | 0 |
one | 5 | 6 | 7 | 4 |
# ascending=False reverses the sort order
frame.sort_index(axis=1, ascending=False)
d | c | b | a | |
---|---|---|---|---|
three | 0 | 3 | 2 | 1 |
one | 4 | 7 | 6 | 5 |
To sort a Series by its values, use its sort_values method. Any missing values are sorted to the end of the Series by default
# sort_values orders by the data, not the labels
obj = pd.Series([-4, 7, 3, 2])
obj.sort_values()
0 -4 3 2 2 3 1 7 dtype: int64
# NaNs are sorted to the end by default
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64
When sorting a DataFrame, you can use the data in one or more columns as the sort keys. To do so, pass one or more column names to the by option of sort_values
# Two-column frame for the by= sort examples
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
b | a | |
---|---|---|
0 | 4 | 0 |
1 | 7 | 1 |
2 | -3 | 0 |
3 | 2 | 1 |
# Sort the whole frame by the values of column 'b'
frame.sort_values(by='b')
b | a | |
---|---|---|
2 | -3 | 0 |
3 | 2 | 1 |
0 | 4 | 0 |
1 | 7 | 1 |
# Sort by 'a' first, breaking ties with 'b'
frame.sort_values(by=['a', 'b'])
b | a | |
---|---|---|
2 | -3 | 0 |
0 | 4 | 0 |
3 | 2 | 1 |
1 | 7 | 1 |
Ranking assigns ranks from one through the number of valid data points in an array. The rank methods for Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank
# Default ranking: ties share the MEAN of their ranks (e.g. the two 7s
# occupy ranks 6 and 7, so both get 6.5)
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
# assigning rank according to the order they're observed first in the data
obj.rank(method='first')
0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64
# Descending ranks; ties take the MAXIMUM rank of the group
obj.rank(ascending=False, method='max')
0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64
# Three-column frame for row-wise ranking
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
frame
b | a | c | |
---|---|---|---|
0 | 4.3 | 0 | -2.0 |
1 | 7.0 | 1 | 5.0 |
2 | -3.0 | 0 | 8.0 |
3 | 2.0 | 1 | -2.5 |
# axis='columns' ranks the values within each row
frame.rank(axis='columns')
b | a | c | |
---|---|---|---|
0 | 3.0 | 2.0 | 1.0 |
1 | 3.0 | 1.0 | 2.0 |
2 | 1.0 | 2.0 | 3.0 |
3 | 3.0 | 2.0 | 1.0 |
# An index is allowed to contain duplicate labels
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
a 0 a 1 b 2 b 3 c 4 dtype: int64
# is_unique reports whether every label occurs only once
obj.index.is_unique
False
# Selecting a duplicated label returns a Series, not a scalar
obj.a
a 0 a 1 dtype: int64
# DataFrame rows may share labels too (random values; transcript shows one run)
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
0 | 1 | 2 | |
---|---|---|---|
a | -2.054623 | 0.654964 | -0.431453 |
a | 0.654782 | 0.776429 | 1.175684 |
b | -0.485586 | -0.960232 | -1.368606 |
b | 0.205797 | 0.266294 | 2.007870 |
# loc on a duplicated label returns all matching rows
df.loc['a']
0 | 1 | 2 | |
---|---|---|---|
a | -2.054623 | 0.654964 | -0.431453 |
a | 0.654782 | 0.776429 | 1.175684 |
# Small frame with NaNs for the reduction examples below
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan],
[0.75, -1.43]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.50 |
c | NaN | NaN |
d | 0.75 | -1.43 |
# Column sums; NaNs are skipped by default
df.sum()
one 9.25 two -5.93 dtype: float64
# Row sums; an all-NaN row ('c') yields 0 when NaNs are skipped
df.sum(axis='columns')
a 1.40 b 2.60 c 0.00 d -0.68 dtype: float64
# skipna=False propagates NaN into any row containing one
df.sum(axis='columns', skipna=False)
a NaN b 2.60 c NaN d -0.68 dtype: float64
Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained
# Index label of the max/min value per column
print(df.idxmax())
print(df.idxmin())
one b two d dtype: object one d two b dtype: object
# accumulation: running column totals, NaNs left in place
df.cumsum()
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 8.50 | -4.50 |
c | NaN | NaN |
d | 9.25 | -5.93 |
Another type of method is neither a reduction nor an accumulation. describe is one such example, producing multiple summary statistics in one shot
# Summary statistics per numeric column (count excludes NaN)
df.describe()
one | two | |
---|---|---|
count | 3.000000 | 2.000000 |
mean | 3.083333 | -2.965000 |
std | 3.493685 | 2.170818 |
min | 0.750000 | -4.500000 |
25% | 1.075000 | -3.732500 |
50% | 1.400000 | -2.965000 |
75% | 4.250000 | -2.197500 |
max | 7.100000 | -1.430000 |
# On non-numeric data, describe produces alternative summary statistics:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()  # count / unique / top / freq
count 16 unique 3 top a freq 8 dtype: object
# Echo df again before the correlation section
df
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.50 |
c | NaN | NaN |
d | 0.75 | -1.43 |
# NOTE(review): requires the third-party pandas-datareader package and
# network access; get_data_yahoo has been unreliable since Yahoo's API
# changes -- confirm it still works before relying on this cell.
import pandas_datareader as web
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = pd.DataFrame({ticker: data['Adj Close']
for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
for ticker, data in all_data.items()})
# Day-over-day percent change of the adjusted close
returns = price.pct_change()
returns.tail()
AAPL | IBM | MSFT | GOOG | |
---|---|---|---|---|
Date | ||||
2019-12-17 | 0.001965 | 0.000671 | -0.005401 | -0.004445 |
2019-12-18 | -0.002389 | 0.001416 | -0.002069 | -0.001845 |
2019-12-19 | 0.001001 | 0.001116 | 0.008681 | 0.002528 |
2019-12-20 | -0.002071 | 0.007655 | 0.010918 | -0.004757 |
2019-12-23 | 0.016318 | -0.000295 | 0.000000 | -0.000556 |
The corr method of Series computes the correlation of the overlapping, non-NA, aligned-by-index values in two Series. Relatedly, cov computes the covariance
# Pairwise covariance and correlation of two aligned Series
print(returns['MSFT'].cov(returns['IBM']))
print(returns['MSFT'].corr(returns['IBM']))
9.227138328874263e-05 0.48490365831130916
# Same correlation via attribute-style column access
returns.MSFT.corr(returns.IBM)
0.48490365831130916
# Full pairwise correlation matrix of the columns
returns.corr()
AAPL | IBM | MSFT | GOOG | |
---|---|---|---|---|
AAPL | 1.000000 | 0.399340 | 0.573224 | 0.521823 |
IBM | 0.399340 | 1.000000 | 0.484904 | 0.410386 |
MSFT | 0.573224 | 0.484904 | 1.000000 | 0.658704 |
GOOG | 0.521823 | 0.410386 | 0.658704 | 1.000000 |
# Full pairwise covariance matrix of the columns
returns.cov()
AAPL | IBM | MSFT | GOOG | |
---|---|---|---|---|
AAPL | 0.000245 | 0.000081 | 0.000132 | 0.000124 |
IBM | 0.000081 | 0.000168 | 0.000092 | 0.000080 |
MSFT | 0.000132 | 0.000092 | 0.000216 | 0.000146 |
GOOG | 0.000124 | 0.000080 | 0.000146 | 0.000229 |
# corrwith method - compute pairwise correlations
# between a DataFrame’s columns or rows with another Series or DataFrame
returns.corrwith(returns.IBM)
AAPL 0.399340 IBM 1.000000 MSFT 0.484904 GOOG 0.410386 dtype: float64
# unique - gives you an array of the unique values in a Series
# (in order of first appearance, not sorted)
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.unique()
array(['c', 'a', 'd', 'b'], dtype=object)
# value_counts - computes a Series containing value frequencies,
# sorted by count descending.
# The top-level pd.value_counts() is deprecated since pandas 2.1 and
# removed in 3.0; call the Series method instead.
obj.value_counts(sort=True)
c 3 a 3 b 2 d 1 dtype: int64
# isin - performs a vectorized set membership check and can be useful in filtering a
# dataset down to a subset of values in a Series or column in a DataFrame
obj
0 c 1 a 2 d 3 a 4 a 5 b 6 b 7 c 8 c dtype: object
# Boolean mask: True where the value is 'b' or 'c'
mask = obj.isin(['b', 'c'])
mask
0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool
# Use the mask to keep only the matching entries
obj[mask]
0 c 5 b 6 b 7 c 8 c dtype: object
# Frame of small integer codes for the per-column histogram example
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
data
Qu1 | Qu2 | Qu3 | |
---|---|---|---|
0 | 1 | 2 | 1 |
1 | 3 | 3 | 5 |
2 | 4 | 1 | 2 |
3 | 3 | 2 | 4 |
4 | 4 | 3 | 4 |
# Per-column frequency tables. The top-level pd.value_counts() is
# deprecated since pandas 2.1 and removed in 3.0; use the Series method.
print(data.Qu1.value_counts())
print(data.Qu2.value_counts())
print(data.Qu3.value_counts())
# apply the same per-column count, then align into one frame
print(data.apply(lambda col: col.value_counts()).fillna(0))
# index - distinct values in the df
# column values - how many times the distinct value occur in that column
4 2 3 2 1 1 Name: Qu1, dtype: int64 3 2 2 2 1 1 Name: Qu2, dtype: int64 4 2 5 1 2 1 1 1 Name: Qu3, dtype: int64 Qu1 Qu2 Qu3 1 1.0 1.0 1.0 2 0.0 2.0 1.0 3 2.0 2.0 0.0 4 2.0 0.0 2.0 5 0.0 0.0 1.0