In [ ]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

Series¶

pd.Series(data, index, name)

In [ ]:

# Series
# General form: s = pd.Series(data, index=index)
# Five random values drawn from a standard normal, labelled 'a'..'e'
row_labels = ['a', 'b', 'c', 'd', 'e']
random_values = np.random.randn(len(row_labels))
s = pd.Series(random_values, index=row_labels)
s

Out[ ]:

a   -0.616345
b   -1.020763
c   -1.874393
d    0.835360
e   -0.330025
dtype: float64

In [ ]:

# The index (row labels) of a Series is exposed through the .index attribute
s.index

Out[ ]:

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [ ]:

# Index labels do not have to be unique: the label 'a' is used twice here
repeated_labels = ['a', 'a', 'c', 'd']
duplicate_index = pd.Series(np.random.randn(len(repeated_labels)),
                            index=repeated_labels)
duplicate_index

Out[ ]:

a   -1.143204
a   -0.623465
c    0.655221
d    1.632808
dtype: float64

In [ ]:

# Looking up a label that occurs more than once returns every matching
# entry as a Series instead of a single scalar
duplicate_index['a']

Out[ ]:

a   -1.143204
a   -0.623465
dtype: float64

In [ ]:

# A dict can be passed to pd.Series; the keys of the dict become the index of the Series

# A Series behaves much like an ndarray, so it can be passed as an argument to most NumPy functions
# Slicing also works on a Series

# Elements can also be selected by integer position.
# Use .iloc for this: a bare integer key such as s[0] on a label-indexed
# Series is ambiguous (label or position?) and is deprecated in recent
# pandas releases, whereas .iloc is always positional.
s.iloc[0]

Out[ ]:

-0.61634508374071517

In [ ]:

# Positional slice: the first three entries (positions 0, 1 and 2)
s.iloc[:3]

Out[ ]:

a   -0.616345
b   -1.020763
c   -1.874393
dtype: float64

In [ ]:

Out[ ]:

a   -0.616345
b   -1.020763
c   -1.874393
d    0.835360
e   -0.330025
dtype: float64

In [ ]:

# A Series also behaves like a dict keyed by its index labels,
# so dict-style get/set by label works on it

# Assigning to a label that is not in the index yet adds a new entry
# (note: this modifies s in place)
s['f'] = 10

In [ ]:

Out[ ]:

a    -0.616345
b    -1.020763
c    -1.874393
d     0.835360
e    -0.330025
f    10.000000
dtype: float64

In [ ]:

# Name
# A Series can carry a name; with no index given, a default
# integer index 0..n-1 is used
unnamed_values = np.random.randn(6)
s = pd.Series(unnamed_values, name='Something')
s

Out[ ]:

0   -0.086604
1   -0.848126
2    0.600843
3   -0.056080
4   -0.412875
5   -1.192173
Name: Something, dtype: float64

In [ ]:

# The name supplied at construction is exposed through the .name attribute
s.name

Out[ ]:

'Something'

DataFrame¶

In [ ]:

# Build a DataFrame from a dict of Series: dict keys become column names and
# the Series are aligned on their labels. 'one' has no entry for label 'd',
# so that cell is filled with NaN.
column_one = pd.Series([1., 2., 3.], index=['a', 'b', 'c'])
column_two = pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({'one': column_one, 'two': column_two})
df

Out[ ]:

	one	two
a	1.0	1.0
b	2.0	2.0
c	3.0	3.0
d	NaN	4.0

In [ ]:

# Row labels of the frame: the union of the labels of the two input Series
df.index

Out[ ]:

Index(['a', 'b', 'c', 'd'], dtype='object')

In [ ]:

# Column labels of the frame: the keys of the dict passed to pd.DataFrame
df.columns

Out[ ]:

Index(['one', 'two'], dtype='object')

In [ ]:

# DataFrame does not function in the same way as numpy 2-D array

# Treat DataFrame as a dict of like-indexed Series objects

In [ ]:

# Selecting a column by name returns it as a Series named after the column
df['one']

Out[ ]:

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [ ]:

# You first select the Series (column) and then the row label you want.
# This is two lookups in a row; df.loc['a', 'one'] reads the same cell in one step.
df['one']['a']

Out[ ]:

1.0

In [ ]:

# Add a derived column holding the row-wise sum of 'one' and 'two'.
# Where either operand is NaN the sum is NaN as well.
df['three'] = df['one'].add(df['two'])
df

Out[ ]:

	one	two	three
a	1.0	1.0	2.0
b	2.0	2.0	4.0
c	3.0	3.0	6.0
d	NaN	4.0	NaN

In [ ]:

# Add a boolean column: True where 'one' is greater than 2.
# A NaN in 'one' compares as False.
df['flag'] = df['one'].gt(2)
df

Out[ ]:

	one	two	three	flag
a	1.0	1.0	2.0	False
b	2.0	2.0	4.0	False
c	3.0	3.0	6.0	True
d	NaN	4.0	NaN	False

In [ ]:

# A column can be removed with del, just like deleting a key from a dict
# (this modifies df in place)
del df['flag']
df

Out[ ]:

	one	two	three
a	1.0	1.0	2.0
b	2.0	2.0	4.0
c	3.0	3.0	6.0
d	NaN	4.0	NaN

In [ ]:

# pop() removes the column from df and returns it as a Series,
# mirroring dict.pop
three = df.pop('three')
three

Out[ ]:

a    2.0
b    4.0
c    6.0
d    NaN
Name: three, dtype: float64

In [ ]:

# You can do almost every dictionary operation on a DataFrame, treating
# the column names as keys and the complete columns as values

# Remember df['name'] selects the 'name' Series in the df
# That Series can then be treated much like a simple ndarray.
# To access an element of df['name'] simply use df['name']['index']

In [ ]:

# Only 'one' and 'two' remain after the del and pop performed earlier
df

Out[ ]:

	one	two
a	1.0	1.0
b	2.0	2.0
c	3.0	3.0
d	NaN	4.0

In [ ]:

# First element of column 'one', selected by integer position.
# .iloc is used because a bare integer key (df['one'][0]) on a label-indexed
# Series is deprecated in recent pandas releases; .iloc is always positional.
df['one'].iloc[0]

Out[ ]:

1.0

In [ ]:

# assign
# Returns a new frame with an extra column built from existing columns;
# the original df is left untouched
df.assign(ratio=df['one'].div(df['two']))

Out[ ]:

	one	two	ratio
a	1.0	1.0	1.0
b	2.0	2.0	1.0
c	3.0	3.0	1.0
d	NaN	4.0	NaN

In [ ]:

# assign also accepts a boolean column: True where 'one' exceeds 2
df.assign(check=df['one'].gt(2))

Out[ ]:

	one	two	check
a	1.0	1.0	False
b	2.0	2.0	False
c	3.0	3.0	True
d	NaN	4.0	False

In [ ]:

# To apply a condition while assigning:
# query() keeps only the rows where 'two' > 2, then assign() adds column
# 'eg1'; the lambda receives the already-filtered frame
df.query('two > 2').assign(eg1 = lambda x: x.two)

# You can also use this syntax if you want a column filled with
# values meeting some required condition

Out[ ]:

	one	two	eg1
c	3.0	3.0	3.0
d	NaN	4.0	4.0

In [ ]:

# To apply multiple conditions, combine them with & inside the query string.
# Only row 'd' survives: its 'two' is 4 and its 'one' is NaN, which
# compares as "not equal to 3".

df.query('two > 2 & one != 3').assign(eg2 = lambda x : x.two)

Out[ ]:

	one	two	eg2
d	NaN	4.0	4.0

In [ ]:

# Indexing rules

# df['col_name'] = Returns the complete Series i.e. column
# df.loc['label'] = Returns the entire row in the form of a Series
# df.iloc[integer_position] = Same as above, but the row is chosen by integer position
# df[5:10] = Slice the rows
# df[bool_vec] = Select rows by boolean vector

In [ ]:

# Frame used by the .loc / .iloc examples that follow
df

Out[ ]:

	one	two
a	1.0	1.0
b	2.0	2.0
c	3.0	3.0
d	NaN	4.0

In [ ]:

# Label-based row selection: row 'b' comes back as a Series indexed by column name
df.loc['b']

Out[ ]:

one    2.0
two    2.0
Name: b, dtype: float64

In [ ]:

# Position-based row selection: position 1 is the second row, i.e. label 'b'
df.iloc[1]

Out[ ]:

one    2.0
two    2.0
Name: b, dtype: float64

In [ ]:

# Alignment and Arithmetic
# Two random frames that deliberately differ in shape: df1 is 10x4 (A-D),
# df2 is 7x3 (A-C)
df1 = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list('ABC'))

In [ ]:

# 10 rows x 4 columns (A-D) of random values
df1

Out[ ]:

	A	B	C	D
0	0.588785	1.198493	-1.631852	1.474286
1	-0.862017	-0.546645	-0.163934	0.278578
2	-1.357200	1.392881	1.134246	-0.362245
3	0.597219	-1.041849	0.127411	-0.227187
4	1.636223	-1.049857	-0.021015	0.539049
5	0.301174	-1.576095	0.987439	1.429394
6	-0.300917	2.874198	-0.596179	1.625892
7	-0.396302	-0.996899	-0.095010	0.259675
8	1.331846	-0.683059	0.889802	1.492522
9	0.709617	0.145721	2.222681	-1.033920

In [ ]:

# 7 rows x 3 columns (A-C) of random values
df2

Out[ ]:

	A	B	C
0	1.762065	2.699884	-0.310927
1	-0.100875	-0.508357	1.682844
2	0.595381	0.081955	0.069323
3	-1.445428	0.423870	0.088223
4	-0.923425	0.081247	-1.525817
5	0.098620	2.393215	0.446321
6	0.037939	-0.149339	2.192394

In [ ]:

# Add the two DataFrames element-wise.
# Operands are aligned on both row and column labels; any label present in
# only one of them (column D, rows 7-9) yields NaN in the result.
df1.add(df2)

Out[ ]:

	A	B	C	D
0	2.350849	3.898377	-1.942780	NaN
1	-0.962892	-1.055002	1.518910	NaN
2	-0.761819	1.474836	1.203569	NaN
3	-0.848209	-0.617979	0.215635	NaN
4	0.712798	-0.968610	-1.546831	NaN
5	0.399794	0.817121	1.433760	NaN
6	-0.262978	2.724859	1.596215	NaN
7	NaN	NaN	NaN	NaN
8	NaN	NaN	NaN	NaN
9	NaN	NaN	NaN	NaN

In [ ]:

# When operating between a Series and a DataFrame, pandas aligns the index
# of the Series with the columns of the DataFrame.
# The example below shows the simpler scalar case instead: the scalar is
# applied to every element, and NaN entries stay NaN.

(df1+df2)*2

Out[ ]:

	A	B	C	D
0	4.701699	7.796754	-3.885559	NaN
1	-1.925784	-2.110004	3.037820	NaN
2	-1.523638	2.949672	2.407138	NaN
3	-1.696418	-1.235958	0.431270	NaN
4	1.425596	-1.937219	-3.093663	NaN
5	0.799588	1.634241	2.867520	NaN
6	-0.525956	5.449719	3.192430	NaN
7	NaN	NaN	NaN	NaN
8	NaN	NaN	NaN	NaN
9	NaN	NaN	NaN	NaN

In [ ]:

# The dtype argument forces every element of the DataFrame to one type;
# here the integer inputs are stored as float32
binary_columns = {'a': [1, 0, 1], 'b': [0, 1, 1]}
df = pd.DataFrame(binary_columns, dtype=np.float32)
df

Out[ ]:

	a	b
0	1.0	0.0
1	0.0	1.0
2	1.0	1.0

In [ ]:

# Per-column dtypes, returned as a Series indexed by column name
df.dtypes

Out[ ]:

a    float32
b    float32
dtype: object

In [ ]:

# Transpose: rows become columns and columns become rows
df.T

Out[ ]:

	0	1	2
a	1.0	0.0	1.0
b	0.0	1.0	1.0

In [ ]:

# NumPy's element-wise arithmetic functions can be applied directly to a
# DataFrame, given that all elements are numeric.
# np.nan is the canonical spelling and works on every NumPy version; the
# np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame({'one': [np.nan, 1, 2]})
print(df)

# NaN entries do not raise an error: they propagate through the operation,
# so the result is NaN at those positions.
# NOTE(review): the original comment attributed this to pandas v0.22 — confirm
# whether older versions really behaved differently.
np.log(df)

   one
0  NaN
1  1.0
2  2.0

Out[ ]:

	one
0	NaN
1	0.000000
2	0.693147

Console Display¶

In [ ]:

# Rebuild the small two-column frame; info() is then used on it to get
# a summary of the DataFrame
series_by_column = {
    'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
}
df = pd.DataFrame(series_by_column)
df

Out[ ]:

	one	two
a	1.0	1.0
b	2.0	2.0
c	3.0	3.0
d	NaN	4.0

In [ ]:

# Prints a summary: index range, per-column non-null counts and dtypes,
# and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
one    3 non-null float64
two    4 non-null float64
dtypes: float64(2)
memory usage: 96.0+ bytes

In [ ]:

# Console display settings:
#   display.width        - how much to print on each line
#   display.max_colwidth - maximum width of an individual column
display_options = {
    'display.width': 40,
    'display.max_colwidth': 30,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

Panel¶

In [ ]:

# For making 3-D arrays 
# Axis names
# 1) items -> axis =0
# 2) major_axis -> axis = 1(index)
# 3) minor_axis -> axis = 2(columns)

In [ ]:

# Build a 3-D Panel from a (2, 5, 4) random array: two items, five
# consecutive dates starting 9/1/2018 on the major axis, and columns A-D on
# the minor axis.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 1.0, so
# this cell presumably only runs on an older pandas — confirm the target
# version (a MultiIndex DataFrame or xarray is the usual replacement).
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], 
             major_axis = pd.date_range('9/1/2018', periods = 5),
             minor_axis = ['A', 'B', 'C', 'D'])

wp

Out[ ]:

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 2018-09-01 00:00:00 to 2018-09-05 00:00:00
Minor_axis axis: A to D

In [ ]:

# Creating from dict: each key becomes an item and each DataFrame one slice.
# The two frames differ in width (3 vs 2 columns); the printed dimensions
# show a minor axis of length 3, i.e. the wider of the two.
# NOTE(review): pd.Panel was removed in pandas 1.0 — confirm the target
# pandas version before relying on this cell.
data = {'Item1':pd.DataFrame(np.random.randn(4,3)),
              'Item2':pd.DataFrame(np.random.randn(4,2))}
wp = pd.Panel(data)

wp

Out[ ]:

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 2