# import
import pandas as pd
import numpy as np
2D ndarray A matrix of data, passing optional row and column labels
dict of arrays, lists, or tuples Each sequence becomes a column in the DataFrame. All sequences must be the same length.
NumPy structured/record array Treated as the “dict of arrays” case
dict of Series Each value becomes a column. Indexes from each Series are unioned together to form the result’s row index if no explicit index is passed.
dict of dicts Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.
list of dicts or Series Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels
List of lists or tuples Treated as the “2D ndarray” case
Another DataFrame The DataFrame’s indexes are used unless different ones are passed
NumPy MaskedArray Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result
state = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada']
year = [2000, 2001, 2002, 2001, 2002]
pop = [1.5, 1.7, 3.6, 2.4, 2.9]
print(type(state), type(year), type(pop))
<class 'list'> <class 'list'> <class 'list'>
# creating dataframe
df = pd.DataFrame({'state':state, 'year':year, 'pop':pop})
print(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): pop 5 non-null float64 state 5 non-null object year 5 non-null int64 dtypes: float64(1), int64(1), object(1) memory usage: 200.0+ bytes None
print(df)
pop state year 0 1.5 Ohio 2000 1 1.7 Ohio 2001 2 3.6 Ohio 2002 3 2.4 Nevada 2001 4 2.9 Nevada 2002
sdata = {'state':state, 'year':year, 'pop':pop}
print(sdata,"\n",type(sdata))
{'year': [2000, 2001, 2002, 2001, 2002], 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]} <class 'dict'>
df = pd.DataFrame(sdata, columns=['pop1', 'state1', 'year1']) # we can not rename columns like this, but create column names
# if doesn't exists
print(df)
Empty DataFrame Columns: [pop1, state1, year1] Index: []
df = pd.DataFrame(sdata, columns=['pop1', 'state', 'year']) # this will pick those columns from sdata which matched
print(df)
pop1 state year 0 NaN Ohio 2000 1 NaN Ohio 2001 2 NaN Ohio 2002 3 NaN Nevada 2001 4 NaN Nevada 2002
df = pd.DataFrame(sdata)
print(df.columns)
Index(['pop', 'state', 'year'], dtype='object')
# renaming columns and index
df.columns = ['pop1', 'state1', 'year1']
df.index = ['one', 'two', 'three', 'four', 'five']
print(df)
pop1 state1 year1 one 1.5 Ohio 2000 two 1.7 Ohio 2001 three 3.6 Ohio 2002 four 2.4 Nevada 2001 five 2.9 Nevada 2002
# stats about dataframe
print(df.index, "\n", df.shape, "\n", df.columns)
Index(['one', 'two', 'three', 'four', 'five'], dtype='object') (5, 3) Index(['pop1', 'state1', 'year1'], dtype='object')
df['pop1'] = 1.5
print(df)
pop1 state1 year1 one 1.5 Ohio 2000 two 1.5 Ohio 2001 three 1.5 Ohio 2002 four 1.5 Nevada 2001 five 1.5 Nevada 2002
df['pop1'] = range(5)
print(df)
pop1 state1 year1 one 0 Ohio 2000 two 1 Ohio 2001 three 2 Ohio 2002 four 3 Nevada 2001 five 4 Nevada 2002
# can access the data as
print(df['state1'])
one Ohio two Ohio three Ohio four Nevada five Nevada Name: state1, dtype: object
print(df.state1)
one Ohio two Ohio three Ohio four Nevada five Nevada Name: state1, dtype: object
# for deleting any columns
del df['pop1']
print(df)
state1 year1 one Ohio 2000 two Ohio 2001 three Ohio 2002 four Nevada 2001 five Nevada 2002
# transpose the dataframe
dft = df.T
print(dft)
one two three four five state1 Ohio Ohio Ohio Nevada Nevada year1 2000 2001 2002 2001 2002
# using columns as an index
df.index = df['year1']
del df['year1']
print(df)
state1 year1 2000 Ohio 2001 Ohio 2002 Ohio 2001 Nevada 2002 Nevada
df.columns.name, df.index.name
(None, 'year1')
df.columns
Index(['state1'], dtype='object')
# printing values
df.values
array([['Ohio'], ['Ohio'], ['Ohio'], ['Nevada'], ['Nevada']], dtype=object)
append Concatenate with additional Index objects, producing a new Index
diff Compute set difference as an Index
intersection Compute set intersection
union Compute set union
isin Compute boolean array indicating whether each value is contained in the passed collection
delete Compute new Index with element at index i deleted
drop Compute new index by deleting passed values
insert Compute new Index by inserting element at index i
is_monotonic Returns True if each element is greater than or equal to the previous element
is_unique Returns True if the Index has no duplicate values
unique Compute the array of unique values in the Index
# Series and DataFrames index are mutable
df.index
Int64Index([2000, 2001, 2002, 2001, 2002], dtype='int64', name='year1')
#df.index[2]=2009 # this will throw a error
index New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying
method Interpolation (fill) method, see Table 5-4 for options.
fill_value Substitute value to use when introducing missing data by reindexing
limit When forward- or backfilling, maximum size gap to fill
level Match simple Index on level of MultiIndex, otherwise select subset of
copy Do not copy underlying data if new index is equivalent to old index. True by default (i.e. always copy data).
print(df)
state1 year1 2000 Ohio 2001 Ohio 2002 Ohio 2001 Nevada 2002 Nevada
df.index
Int64Index([2000, 2001, 2002, 2001, 2002], dtype='int64', name='year1')
# df2 = df.reindex([2000, 2001, 2002, 2001, 2002, 2009])
# this will throw an value error, as index should be unique
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])
print(frame)
Ohio Texas California a 0 1 2 c 3 4 5 d 6 7 8
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
Ohio Texas California a 0.0 1.0 2.0 b NaN NaN NaN c 3.0 4.0 5.0 d 6.0 7.0 8.0
# likewise let's revert the df
df['year'] = df.index
df.index = [0,1,2,3,4]
print(df)
state1 year 0 Ohio 2000 1 Ohio 2001 2 Ohio 2002 3 Nevada 2001 4 Nevada 2002
# now we can reindex this df
df2 = df.reindex([1,2,3,4,5,6,7]) # again, reindex will first look into the df and then create the new
print(df2) # as here, it will keep 1,2,3,4 and drop 0 and create new 5,6,7 index
state1 year 1 Ohio 2001.0 2 Ohio 2002.0 3 Nevada 2001.0 4 Nevada 2002.0 5 NaN NaN 6 NaN NaN 7 NaN NaN
# better and faster way to do that is -
df3=df2.ix[[1,2,3,4,6]]
print(df3)
state1 year 1 Ohio 2001.0 2 Ohio 2002.0 3 Nevada 2001.0 4 Nevada 2002.0 6 NaN NaN
# CAN ALter the columns as well
new_columns = ['state1', 'year', 'population']
df4 = df3.ix[[1,2,3,4,6], new_columns]
print(df4)
state1 year population 1 Ohio 2001.0 NaN 2 Ohio 2002.0 NaN 3 Nevada 2001.0 NaN 4 Nevada 2002.0 NaN 6 NaN NaN NaN
df4.columns
Index(['state1', 'year', 'population'], dtype='object')
# renaming columns
df4.columns = ['state', 'year', 'pop']
print(df4)
state year pop 1 Ohio 2001.0 NaN 2 Ohio 2002.0 NaN 3 Nevada 2001.0 NaN 4 Nevada 2002.0 NaN 6 NaN NaN NaN
# dropping index or columns
df5=df4.drop([3])
print(df5)
state year pop 1 Ohio 2001.0 NaN 2 Ohio 2002.0 NaN 4 Nevada 2002.0 NaN 6 NaN NaN NaN
df5 = df5.drop(['pop'], axis=1)
print(df5)
state year 1 Ohio 2001.0 2 Ohio 2002.0 4 Nevada 2002.0 6 NaN NaN
df4
state | year | pop | |
---|---|---|---|
1 | Ohio | 2001.0 | NaN |
2 | Ohio | 2002.0 | NaN |
3 | Nevada | 2001.0 | NaN |
4 | Nevada | 2002.0 | NaN |
6 | NaN | NaN | NaN |
df4[df4['state']=='Ohio']
state | year | pop | |
---|---|---|---|
1 | Ohio | 2001.0 | NaN |
2 | Ohio | 2002.0 | NaN |
df4[['state', 'year']]
state | year | |
---|---|---|
1 | Ohio | 2001.0 |
2 | Ohio | 2002.0 |
3 | Nevada | 2001.0 |
4 | Nevada | 2002.0 |
6 | NaN | NaN |
df4['year'][df4['state']=='Ohio']=2004
C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
df4
state | year | pop | |
---|---|---|---|
1 | Ohio | 2004.0 | NaN |
2 | Ohio | 2004.0 | NaN |
3 | Nevada | 2001.0 | NaN |
4 | Nevada | 2002.0 | NaN |
6 | NaN | NaN | NaN |
# ix enables you to select a subset of the rows and columns from a DataFrame with NumPy like notation plus axis labels
df4.ix[[1,2],['state']]
state | |
---|---|
1 | Ohio |
2 | Ohio |
df4.ix[[3,6],[0,2]]
state | pop | |
---|---|---|
3 | Nevada | NaN |
6 | NaN | NaN |
df4.ix[df4['year']<2003,[0,2]]
state | pop | |
---|---|---|
3 | Nevada | NaN |
4 | Nevada | NaN |
obj[val] Select single column or sequence of columns from the DataFrame. Special case con-veniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion).
obj.ix[val] Selects single row of subset of rows from the DataFrame.
obj.ix[:, val] Selects single column of subset of columns.
obj.ix[val1, val2] Select both rows and columns.
reindex method Conform one or more axes to new indexes.
xs method Select single row or column as a Series by label.
icol, irow methods Select single column or row, respectively, as a Series by integer location.
get_value, set_value methods Select single value by row and column label.
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1 + s2 #assigned NaN for those index which is not found in another series
a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1 + df2
b | c | d | e | |
---|---|---|---|---|
Colorado | NaN | NaN | NaN | NaN |
Ohio | 3.0 | NaN | 6.0 | NaN |
Oregon | NaN | NaN | NaN | NaN |
Texas | 9.0 | NaN | 12.0 | NaN |
Utah | NaN | NaN | NaN | NaN |
df1.add(df2, fill_value=0)
b | c | d | e | |
---|---|---|---|---|
Colorado | 6.0 | 7.0 | 8.0 | NaN |
Ohio | 3.0 | 1.0 | 6.0 | 5.0 |
Oregon | 9.0 | NaN | 10.0 | 11.0 |
Texas | 9.0 | 4.0 | 12.0 | 8.0 |
Utah | 0.0 | NaN | 1.0 | 2.0 |
# when reindexing a Series or DataFrame, you can also specify a different fill value
df1.reindex(columns=df2.columns, fill_value=0)
b | d | e | |
---|---|---|---|
Ohio | 0.0 | 2.0 | 0 |
Texas | 3.0 | 5.0 | 0 |
Colorado | 6.0 | 8.0 | 0 |
add Method for addition (+)
sub Method for subtraction (-)
div Method for division (/)
mul Method for multiplication (*)
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
b | d | e | |
---|---|---|---|
Utah | 0.0 | 1.0 | 2.0 |
Ohio | 3.0 | 4.0 | 5.0 |
Texas | 6.0 | 7.0 | 8.0 |
Oregon | 9.0 | 10.0 | 11.0 |
series = frame.ix[0] # pickng first row
series
b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64
frame * series
b | d | e | |
---|---|---|---|
Utah | 0.0 | 1.0 | 4.0 |
Ohio | 0.0 | 4.0 | 10.0 |
Texas | 0.0 | 7.0 | 16.0 |
Oregon | 0.0 | 10.0 | 22.0 |
# By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns,
# broadcasting down the rows:
frame - series
b | d | e | |
---|---|---|---|
Utah | 0.0 | 0.0 | 0.0 |
Ohio | 3.0 | 3.0 | 3.0 |
Texas | 6.0 | 6.0 | 6.0 |
Oregon | 9.0 | 9.0 | 9.0 |
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame * series2
b | d | e | f | |
---|---|---|---|---|
Utah | 0.0 | NaN | 2.0 | NaN |
Ohio | 0.0 | NaN | 5.0 | NaN |
Texas | 0.0 | NaN | 8.0 | NaN |
Oregon | 0.0 | NaN | 11.0 | NaN |
f = lambda x : x.max() - x.min()
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
b d e Utah 0.572069 1.094135 0.809313 Ohio -0.205726 0.284662 0.193174 Texas -1.313522 0.970605 -1.067873 Oregon 0.052951 0.225374 -0.274396
frame.apply(f)
b 1.885591 d 0.868761 e 1.877187 dtype: float64
frame.apply(f, axis=1)
Utah 0.522065 Ohio 0.490388 Texas 2.284127 Oregon 0.499770 dtype: float64
# defining a func
def f(x):
return pd.Series([x.max(), x.min()], index=['max', 'min'])
frame.apply(f)
b | d | e | |
---|---|---|---|
max | 0.572069 | 1.094135 | 0.809313 |
min | -1.313522 | 0.225374 | -1.067873 |
frame.apply(f, axis=1)
max | min | |
---|---|---|
Utah | 1.094135 | 0.572069 |
Ohio | 0.284662 | -0.205726 |
Texas | 0.970605 | -1.313522 |
Oregon | 0.225374 | -0.274396 |
format = lambda x: '%.2f' % x
frame.applymap(format)
b | d | e | |
---|---|---|---|
Utah | 0.57 | 1.09 | 0.81 |
Ohio | -0.21 | 0.28 | 0.19 |
Texas | -1.31 | 0.97 | -1.07 |
Oregon | 0.05 | 0.23 | -0.27 |
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj
d 0 a 1 b 2 c 3 dtype: int32
# sorting on index
obj.sort_index()
a 1 b 2 c 3 d 0 dtype: int32
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame
d | a | b | c | |
---|---|---|---|---|
three | 0 | 1 | 2 | 3 |
one | 4 | 5 | 6 | 7 |
frame.sort_index()
d | a | b | c | |
---|---|---|---|---|
one | 4 | 5 | 6 | 7 |
three | 0 | 1 | 2 | 3 |
frame.sort_index(axis=1)
a | b | c | d | |
---|---|---|---|---|
three | 1 | 2 | 3 | 0 |
one | 5 | 6 | 7 | 4 |
frame.sort_index(axis=1).sort_index()
a | b | c | d | |
---|---|---|---|---|
one | 5 | 6 | 7 | 4 |
three | 1 | 2 | 3 | 0 |
frame.sort_index(axis=1, ascending=False)
d | c | b | a | |
---|---|---|---|---|
three | 0 | 3 | 2 | 1 |
one | 4 | 7 | 6 | 5 |
# To sort a Series by its values, use its order method
sr = pd.Series(['2', np.nan, '-3', '5'])
sr
0 2 1 NaN 2 -3 3 5 dtype: object
# sorting by value
sr.sort_values()
2 -3 0 2 3 5 1 NaN dtype: object
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
a | b | |
---|---|---|
0 | 0 | 4 |
1 | 1 | 7 |
2 | 0 | -3 |
3 | 1 | 2 |
frame.sort_values(by='b')
a | b | |
---|---|---|
2 | 0 | -3 |
3 | 1 | 2 |
0 | 0 | 4 |
1 | 1 | 7 |
frame.sort_values(by=['a', 'b'])
a | b | |
---|---|---|
2 | 0 | -3 |
0 | 0 | 4 |
3 | 1 | 2 |
1 | 1 | 7 |
# ranking # Explore more
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj
0 7 1 -5 2 7 3 4 4 2 5 0 6 4 dtype: int64
obj.rank()
0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
a 0 a 1 b 2 b 3 c 4 dtype: int32
obj.index.unique() # get unique index
array(['a', 'b', 'c'], dtype=object)
obj.index.is_unique # check if index are unique
False
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
0 | 1 | 2 | |
---|---|---|---|
a | -1.077273 | 0.747123 | 0.948973 |
a | 1.097137 | 0.753723 | 0.810827 |
b | -1.413813 | -0.161147 | 0.414814 |
b | -0.633463 | 0.705439 | 0.175328 |
df.index.is_unique
False
df.ix['a'] # ix is used to select rows by index
0 | 1 | 2 | |
---|---|---|---|
a | -1.077273 | 0.747123 | 0.948973 |
a | 1.097137 | 0.753723 | 0.810827 |
df.ix[0]
0 -1.077273 1 0.747123 2 0.948973 Name: a, dtype: float64