import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
# Constructor signature, for reference only. It is kept as a comment because
# `data`, `index` and `name` are not defined at this point in the file, so
# executing it would raise NameError:
#     pd.Series(data=None, index=None, dtype=None, name=None)
# Note that `name` must be passed by keyword; the third positional
# parameter is `dtype`.
# Series
# s = pd.Series(data, index=index)
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s
a -0.616345 b -1.020763 c -1.874393 d 0.835360 e -0.330025 dtype: float64
# To get the index
s.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
duplicate_index = pd.Series(np.random.randn(4), index = ['a', 'a', 'c', 'd'])
duplicate_index
a -1.143204 a -0.623465 c 0.655221 d 1.632808 dtype: float64
duplicate_index['a']
a -1.143204 a -0.623465 dtype: float64
# A dict can be passed to pd.Series; the keys of the dict become the index of the Series
# A Series behaves much like an ndarray, so it can be passed as an argument to most NumPy functions
# Slicing also works on a Series, and the index is sliced along with the values
# Elements can also be accessed by integer position
# Positional access to the first element. Use .iloc explicitly: treating an
# integer key as a position on a label-indexed Series (s[0]) is deprecated
# in recent pandas releases.
s.iloc[0]
-0.61634508374071517
s[0:3]
a -0.616345 b -1.020763 c -1.874393 dtype: float64
s
a -0.616345 b -1.020763 c -1.874393 d 0.835360 e -0.330025 dtype: float64
# A Series is also dict-like: values can be read and set by index label
# So most dict operations also work on a Series
# You can also explicitly add a new label by assigning to it
s['f'] = 10
s
a -0.616345 b -1.020763 c -1.874393 d 0.835360 e -0.330025 f 10.000000 dtype: float64
# Name
s = pd.Series(np.random.randn(6), name='Something')
s
0 -0.086604 1 -0.848126 2 0.600843 3 -0.056080 4 -0.412875 5 -1.192173 Name: Something, dtype: float64
s.name
'Something'
# Build a DataFrame from a dict of Series. The Series have different
# lengths, so the frame's index covers every label ('a'..'d') and column
# 'one' has no value for 'd' (shown as NaN).
df = pd.DataFrame(dict(
    one=pd.Series([1., 2., 3.], index=list('abc')),
    two=pd.Series([1., 2., 3., 4.], index=list('abcd')),
))
df
one | two | |
---|---|---|
a | 1.0 | 1.0 |
b | 2.0 | 2.0 |
c | 3.0 | 3.0 |
d | NaN | 4.0 |
df.index
Index(['a', 'b', 'c', 'd'], dtype='object')
df.columns
Index(['one', 'two'], dtype='object')
# DataFrame does not function in the same way as numpy 2-D array
# Treat DataFrame as a dict of like-indexed Series objects
df['one']
a 1.0 b 2.0 c 3.0 d NaN Name: one, dtype: float64
# You first select the Series and then the row you want
df['one']['a']
1.0
df['three'] = df['one'] + df['two']
df
one | two | three | |
---|---|---|---|
a | 1.0 | 1.0 | 2.0 |
b | 2.0 | 2.0 | 4.0 |
c | 3.0 | 3.0 | 6.0 |
d | NaN | 4.0 | NaN |
df['flag'] = df['one'] > 2
df
one | two | three | flag | |
---|---|---|---|---|
a | 1.0 | 1.0 | 2.0 | False |
b | 2.0 | 2.0 | 4.0 | False |
c | 3.0 | 3.0 | 6.0 | True |
d | NaN | 4.0 | NaN | False |
del df['flag']
df
one | two | three | |
---|---|---|---|
a | 1.0 | 1.0 | 2.0 |
b | 2.0 | 2.0 | 4.0 |
c | 3.0 | 3.0 | 6.0 |
d | NaN | 4.0 | NaN |
three = df.pop('three')
three
a 2.0 b 4.0 c 6.0 d NaN Name: three, dtype: float64
# You can do almost all dictionary operations on a DataFrame, treating
# the column names as keys and the complete columns as values
# Remember df['name'] selects the 'name' Series in the df
# That Series can then be treated much like a simple ndarray.
# To access an element of df['name'], simply use df['name']['index']
df
one | two | |
---|---|---|
a | 1.0 | 1.0 |
b | 2.0 | 2.0 |
c | 3.0 | 3.0 |
d | NaN | 4.0 |
# Select the column by name, then its first element by position. Use .iloc
# explicitly: treating an integer key as a position on a label-indexed
# Series (df['one'][0]) is deprecated in recent pandas releases.
df['one'].iloc[0]
1.0
# assign
# To create a new column from a combination of the existing columns
df.assign(ratio = df['one']/df['two'])
one | two | ratio | |
---|---|---|---|
a | 1.0 | 1.0 | 1.0 |
b | 2.0 | 2.0 | 1.0 |
c | 3.0 | 3.0 | 1.0 |
d | NaN | 4.0 | NaN |
df.assign(check = df['one'] > 2)
one | two | check | |
---|---|---|---|
a | 1.0 | 1.0 | False |
b | 2.0 | 2.0 | False |
c | 3.0 | 3.0 | True |
d | NaN | 4.0 | False |
# To filter rows by a condition before assigning a new column
df.query('two > 2').assign(eg1 = lambda x: x.two)
# You can also use this syntax if you want a column filled with
# values meeting some required condition
one | two | eg1 | |
---|---|---|---|
c | 3.0 | 3.0 | 3.0 |
d | NaN | 4.0 | 4.0 |
# To apply multiple conditions
df.query('two > 2 & one != 3').assign(eg2 = lambda x : x.two)
one | two | eg2 | |
---|---|---|---|
d | NaN | 4.0 | 4.0 |
# Indexing rules
# df['col_name'] = Returns the complete column as a Series
# df.loc['label'] = Returns the entire row as a Series, selected by label
# df.iloc[integer_position] = Same as above, but selected by integer position
# df[5:10] = Slices the rows
# df[bool_vec] = Selects rows by boolean vector
df
one | two | |
---|---|---|
a | 1.0 | 1.0 |
b | 2.0 | 2.0 |
c | 3.0 | 3.0 |
d | NaN | 4.0 |
df.loc['b']
one 2.0 two 2.0 Name: b, dtype: float64
df.iloc[1]
one 2.0 two 2.0 Name: b, dtype: float64
# Alignment and Arithmetic
# Two random frames of different shapes (10x4 and 7x3) that share the
# column labels A-C; used below to demonstrate label alignment.
df1 = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list('ABC'))
df1
A | B | C | D | |
---|---|---|---|---|
0 | 0.588785 | 1.198493 | -1.631852 | 1.474286 |
1 | -0.862017 | -0.546645 | -0.163934 | 0.278578 |
2 | -1.357200 | 1.392881 | 1.134246 | -0.362245 |
3 | 0.597219 | -1.041849 | 0.127411 | -0.227187 |
4 | 1.636223 | -1.049857 | -0.021015 | 0.539049 |
5 | 0.301174 | -1.576095 | 0.987439 | 1.429394 |
6 | -0.300917 | 2.874198 | -0.596179 | 1.625892 |
7 | -0.396302 | -0.996899 | -0.095010 | 0.259675 |
8 | 1.331846 | -0.683059 | 0.889802 | 1.492522 |
9 | 0.709617 | 0.145721 | 2.222681 | -1.033920 |
df2
A | B | C | |
---|---|---|---|
0 | 1.762065 | 2.699884 | -0.310927 |
1 | -0.100875 | -0.508357 | 1.682844 |
2 | 0.595381 | 0.081955 | 0.069323 |
3 | -1.445428 | 0.423870 | 0.088223 |
4 | -0.923425 | 0.081247 | -1.525817 |
5 | 0.098620 | 2.393215 | 0.446321 |
6 | 0.037939 | -0.149339 | 2.192394 |
# To add the two DataFrames
# Row labels and columns missing from either frame are filled with NaN
df1+df2
A | B | C | D | |
---|---|---|---|---|
0 | 2.350849 | 3.898377 | -1.942780 | NaN |
1 | -0.962892 | -1.055002 | 1.518910 | NaN |
2 | -0.761819 | 1.474836 | 1.203569 | NaN |
3 | -0.848209 | -0.617979 | 0.215635 | NaN |
4 | 0.712798 | -0.968610 | -1.546831 | NaN |
5 | 0.399794 | 0.817121 | 1.433760 | NaN |
6 | -0.262978 | 2.724859 | 1.596215 | NaN |
7 | NaN | NaN | NaN | NaN |
8 | NaN | NaN | NaN | NaN |
9 | NaN | NaN | NaN | NaN |
# When operating between a Series and a DataFrame, pandas aligns the index
# of the Series with the columns of the DataFrame
(df1+df2)*2
A | B | C | D | |
---|---|---|---|---|
0 | 4.701699 | 7.796754 | -3.885559 | NaN |
1 | -1.925784 | -2.110004 | 3.037820 | NaN |
2 | -1.523638 | 2.949672 | 2.407138 | NaN |
3 | -1.696418 | -1.235958 | 0.431270 | NaN |
4 | 1.425596 | -1.937219 | -3.093663 | NaN |
5 | 0.799588 | 1.634241 | 2.867520 | NaN |
6 | -0.525956 | 5.449719 | 3.192430 | NaN |
7 | NaN | NaN | NaN | NaN |
8 | NaN | NaN | NaN | NaN |
9 | NaN | NaN | NaN | NaN |
# You can explicitly set the dtype of all elements of a DataFrame
df = pd.DataFrame({'a':[1,0,1], 'b':[0,1,1]}, dtype=np.float32)
df
a | b | |
---|---|---|
0 | 1.0 | 0.0 |
1 | 0.0 | 1.0 |
2 | 1.0 | 1.0 |
df.dtypes
a float32 b float32 dtype: object
df.T
0 | 1 | 2 | |
---|---|---|---|
a | 1.0 | 0.0 | 1.0 |
b | 0.0 | 1.0 | 1.0 |
# All arithmetic functions of NumPy can be applied to DataFrames,
# given that all elements are numeric
# Use np.nan: the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame({'one': [np.nan, 1, 2]})
print(df)
# NaN entries pass through the operation: np.log returns NaN for them
# instead of raising an error. (The original note attributed this to the
# "v.22" release, as opposed to an error in earlier versions -- TODO confirm
# which library and version that refers to.)
np.log(df)
one 0 NaN 1 1.0 2 2.0
one | |
---|---|
0 | NaN |
1 | 0.000000 |
2 | 0.693147 |
# Use info() to get summary information about a DataFrame
# Recreate the two-column example frame: 'one' covers labels a-c and 'two'
# covers a-d, so 'one' is missing a value at 'd'.
df = pd.DataFrame(
    {
        'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
    }
)
df
one | two | |
---|---|---|
a | 1.0 | 1.0 |
b | 2.0 | 2.0 |
c | 3.0 | 3.0 |
d | NaN | 4.0 |
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 4 entries, a to d Data columns (total 2 columns): one 3 non-null float64 two 4 non-null float64 dtypes: float64(2) memory usage: 96.0+ bytes
# How much to print on each line
pd.set_option('display.width', 40)
# Max-width of individual columns
pd.set_option('display.max_colwidth', 30)
# For making 3-D arrays
# Axis names
# 1) items -> axis = 0
# 2) major_axis -> axis = 1 (index)
# 3) minor_axis -> axis = 2 (columns)
# NOTE: pd.Panel was deprecated and later removed from pandas. The Panel
# call is kept for old installations. Where it is unavailable, the same
# data is stored as a DataFrame whose rows carry an (items, major_axis)
# MultiIndex and whose columns are the minor axis.
try:
    wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
                  major_axis=pd.date_range('9/1/2018', periods=5),
                  minor_axis=['A', 'B', 'C', 'D'])
except (AttributeError, TypeError):
    # AttributeError: pd.Panel no longer exists.
    # TypeError: pd.Panel is only a non-functional placeholder class.
    wp = pd.concat(
        {
            item: pd.DataFrame(np.random.randn(5, 4),
                               index=pd.date_range('9/1/2018', periods=5),
                               columns=['A', 'B', 'C', 'D'])
            for item in ['Item1', 'Item2']
        },
        names=['items', 'major_axis'],
    )
wp
<class 'pandas.core.panel.Panel'> Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis) Items axis: Item1 to Item2 Major_axis axis: 2018-09-01 00:00:00 to 2018-09-05 00:00:00 Minor_axis axis: A to D
# Creating from dict
# The two frames have different widths (4x3 and 4x2).
data = {'Item1': pd.DataFrame(np.random.randn(4, 3)),
        'Item2': pd.DataFrame(np.random.randn(4, 2))}
# NOTE: pd.Panel was deprecated and later removed from pandas. The Panel
# call is kept for old installations. Where it is unavailable, the frames
# are stacked into one DataFrame keyed by an (items, major_axis) MultiIndex;
# the column missing from 'Item2' is filled with NaN.
try:
    wp = pd.Panel(data)
except (AttributeError, TypeError):
    # AttributeError: pd.Panel no longer exists.
    # TypeError: pd.Panel is only a non-functional placeholder class.
    wp = pd.concat(data, names=['items', 'major_axis'])
wp
<class 'pandas.core.panel.Panel'> Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis) Items axis: Item1 to Item2 Major_axis axis: 0 to 3 Minor_axis axis: 0 to 2