import pandas as pd
df=pd.read_csv('Example.csv',na_values=['not available'])
df.head()
name | class | total marks | |
---|---|---|---|
0 | Mukul | 12 | 454.0 |
1 | Rohan | 12 | 433.0 |
2 | Shivam | 11 | NaN |
3 | Ragav | 11 | NaN |
4 | Monu | 10 | 456.0 |
Sample Python dictionary data and list labels: exam_data = {'name': ['Anastasia', 'Dima', 'Katherine', 'James', 'Emily', 'Michael', 'Matthew', 'Laura', 'Kevin', 'Jonas'], 'score': [12.5, 9, 16.5, np.nan, 9, 20, 14.5, np.nan, 8, 19], 'attempts': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], 'qualify': ['yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes']} labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
dict_data = {'name': list(df.loc[:2,'name']),'class': list(df.loc[:2,'class'])}
dict_data
{'name': ['Mukul', 'Rohan', 'Shivam'], 'class': [12, 12, 11]}
import numpy as np
import pandas as pd
dictx= {'name': ['Anastasia', 'Dima', 'Katherine', 'James', 'Emily', 'Michael', 'Matthew', 'Laura', 'Kevin', 'Jonas'], 'score': [12.5, 9, 16.5, np.nan, 9, 20, 14.5, np.nan, 8, 19], 'attempts': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], 'qualify': ['yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes']}
df2=pd.DataFrame(dictx,index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
df2
name | score | attempts | qualify | |
---|---|---|---|---|
a | Anastasia | 12.5 | 1 | yes |
b | Dima | 9.0 | 3 | no |
c | Katherine | 16.5 | 2 | yes |
d | James | NaN | 3 | no |
e | Emily | 9.0 | 2 | no |
f | Michael | 20.0 | 3 | yes |
g | Matthew | 14.5 | 1 | yes |
h | Laura | NaN | 1 | no |
i | Kevin | 8.0 | 2 | no |
j | Jonas | 19.0 | 1 | yes |
df2.iloc[:3]
name | score | attempts | qualify | |
---|---|---|---|---|
a | Anastasia | 12.5 | 1 | yes |
b | Dima | 9.0 | 3 | no |
c | Katherine | 16.5 | 2 | yes |
df2.loc[:,['name','score']]
name | score | |
---|---|---|
a | Anastasia | 12.5 |
b | Dima | 9.0 |
c | Katherine | 16.5 |
d | James | NaN |
e | Emily | 9.0 |
f | Michael | 20.0 |
g | Matthew | 14.5 |
h | Laura | NaN |
i | Kevin | 8.0 |
j | Jonas | 19.0 |
Select 'name' and 'score' columns in rows 1, 3, 5, 6 from the following data frame.
df2.index =[0,1,2,3,4,5,6,7,8,9]
df2.loc[[1,3,5,6],['name','score']]
name | score | |
---|---|---|
1 | Dima | 9.0 |
3 | James | NaN |
5 | Michael | 20.0 |
6 | Matthew | 14.5 |
df2.loc[:,['name','score']]
name | score | |
---|---|---|
0 | Anastasia | 12.5 |
1 | Dima | 9.0 |
2 | Katherine | 16.5 |
3 | James | NaN |
4 | Emily | 9.0 |
5 | Michael | 20.0 |
6 | Matthew | 14.5 |
7 | Laura | NaN |
8 | Kevin | 8.0 |
9 | Jonas | 19.0 |
df2.groupby('name').aggregate({'attempts':max}).reset_index()
name | attempts | |
---|---|---|
0 | Anastasia | 1 |
1 | Dima | 3 |
2 | Emily | 2 |
3 | James | 3 |
4 | Jonas | 1 |
5 | Katherine | 2 |
6 | Kevin | 2 |
7 | Laura | 1 |
8 | Matthew | 1 |
9 | Michael | 3 |
df2.groupby('attempts').max().reset_index().sort_values(by='attempts',ascending=False).head(1)
attempts | name | score | qualify | |
---|---|---|---|---|
2 | 3 | Michael | 20.0 | yes |
df2[df2['attempts']>2]['name']
b Dima d James f Michael Name: name, dtype: object
df2.loc[df2['attempts']>2]
name | score | attempts | qualify | |
---|---|---|---|---|
b | Dima | 9.0 | 3 | no |
d | James | NaN | 3 | no |
f | Michael | 20.0 | 3 | yes |
df2.loc[df2['attempts']>2,['attempts','name']]
attempts | name | |
---|---|---|
b | 3.0 | Dima |
d | 3.0 | James |
f | 3.0 | Michael |
df2.shape[0]
10
df2.loc[df2['score'].isna()==True]
name | score | attempts | qualify | |
---|---|---|---|---|
d | James | NaN | 3.0 | no |
h | Laura | NaN | 1.0 | no |
df2['score'].value_counts()
9.0 2 19.0 1 8.0 1 14.5 1 20.0 1 16.5 1 12.5 1 Name: score, dtype: int64
df2['score'].isna().sum()
2
df2[df2['score'].isna()]
name | score | attempts | qualify | |
---|---|---|---|---|
3 | James | NaN | 3 | no |
7 | Laura | NaN | 1 | no |
df2[df2['score'].between(15,20)]
name | score | attempts | qualify | |
---|---|---|---|---|
2 | Katherine | 16.5 | 2 | yes |
5 | Michael | 20.0 | 3 | yes |
9 | Jonas | 19.0 | 1 | yes |
df2.loc[(df2['attempts']<2) & (df2['score']>15)]
name | score | attempts | qualify | |
---|---|---|---|---|
9 | Jonas | 19.0 | 1 | yes |
df2.loc[4,'score']=11.5
df2
name | score | attempts | qualify | |
---|---|---|---|---|
0 | Anastasia | 12.5 | 1 | yes |
1 | Dima | 9.0 | 3 | no |
2 | Katherine | 16.5 | 2 | yes |
3 | James | NaN | 3 | no |
4 | Emily | 11.5 | 2 | no |
5 | Michael | 20.0 | 3 | yes |
6 | Matthew | 14.5 | 1 | yes |
7 | Laura | NaN | 1 | no |
8 | Kevin | 8.0 | 2 | no |
9 | Jonas | 19.0 | 1 | yes |
To calculate the sum of the examination attempts by the students.
df2['attempts'].aggregate(sum)
19
df2.groupby('name').aggregate({'score':np.mean})
score | |
---|---|
name | |
Anastasia | 12.5 |
Dima | 9.0 |
Emily | 9.0 |
James | NaN |
Jonas | 19.0 |
Katherine | 16.5 |
Kevin | 8.0 |
Laura | NaN |
Matthew | 14.5 |
Michael | 20.0 |
df2.loc['k'] = ['smesh',15,1,'yes']
print(df2)
name score attempts qualify a Anastasia 12.5 1 yes b Dima 9.0 3 no c Katherine 16.5 2 yes d James NaN 3 no e Emily 9.0 2 no f Michael 20.0 3 yes g Matthew 14.5 1 yes h Laura NaN 1 no i Kevin 8.0 2 no j Jonas 19.0 1 yes k smesh 15.0 1 yes
df2.loc['k']
df2.drop('k')
name | score | attempts | qualify | |
---|---|---|---|---|
a | Anastasia | 12.5 | 1 | yes |
b | Dima | 9.0 | 3 | no |
c | Katherine | 16.5 | 2 | yes |
d | James | NaN | 3 | no |
e | Emily | 9.0 | 2 | no |
f | Michael | 20.0 | 3 | yes |
g | Matthew | 14.5 | 1 | yes |
h | Laura | NaN | 1 | no |
i | Kevin | 8.0 | 2 | no |
j | Jonas | 19.0 | 1 | yes |
df2['extra']=np.arange(11)
df2
name | score | attempts | qualify | extra | |
---|---|---|---|---|---|
0 | Anastasia | 12.5 | 1 | yes | 0 |
1 | Dima | 9.0 | 3 | no | 1 |
2 | Katherine | 16.5 | 2 | yes | 2 |
3 | James | NaN | 3 | no | 3 |
4 | Emily | 11.5 | 2 | no | 4 |
5 | Michael | 20.0 | 3 | yes | 5 |
6 | Matthew | 14.5 | 1 | yes | 6 |
7 | Laura | NaN | 1 | no | 7 |
8 | Kevin | 8.0 | 2 | no | 8 |
9 | Jonas | 19.0 | 1 | yes | 9 |
k | smesh | 15.0 | 1 | yes | 10 |
df2.drop('extra',axis=1,inplace=True)
df2.sort_values(by=['name','score'],ascending=[True,False])
name | score | attempts | qualify | |
---|---|---|---|---|
0 | Anastasia | 12.5 | 1 | yes |
1 | Dima | 9.0 | 3 | no |
4 | Emily | 11.5 | 2 | no |
3 | James | NaN | 3 | no |
9 | Jonas | 19.0 | 1 | yes |
2 | Katherine | 16.5 | 2 | yes |
8 | Kevin | 8.0 | 2 | no |
7 | Laura | NaN | 1 | no |
6 | Matthew | 14.5 | 1 | yes |
5 | Michael | 20.0 | 3 | yes |
k | smesh | 15.0 | 1 | yes |
df3=df2.replace({'yes':True,'no':False})
df3
name | score | attempts | qualify | |
---|---|---|---|---|
0 | Anastasia | 12.5 | 1 | True |
1 | Dima | 9.0 | 3 | False |
2 | Katherine | 16.5 | 2 | True |
3 | James | NaN | 3 | False |
4 | Emily | 11.5 | 2 | False |
5 | Michael | 20.0 | 3 | True |
6 | Matthew | 14.5 | 1 | True |
7 | Laura | NaN | 1 | False |
8 | Kevin | 8.0 | 2 | False |
9 | Jonas | 19.0 | 1 | True |
k | smesh | 15.0 | 1 | True |
df3.loc[df3['name']=='James','name']='Suresh'
df2[df2['name']=='James']['name'] ='Suresh'
<ipython-input-31-365e2a0b86da>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df2[df2['name']=='James']['name'] ='Suresh'
df3
name | score | attempts | qualify | |
---|---|---|---|---|
0 | Anastasia | 12.5 | 1 | True |
1 | Dima | 9.0 | 3 | False |
2 | Katherine | 16.5 | 2 | True |
3 | Suresh | NaN | 3 | False |
4 | Emily | 11.5 | 2 | False |
5 | Michael | 20.0 | 3 | True |
6 | Matthew | 14.5 | 1 | True |
7 | Laura | NaN | 1 | False |
8 | Kevin | 8.0 | 2 | False |
9 | Jonas | 19.0 | 1 | True |
k | smesh | 15.0 | 1 | True |
df3=df3.drop('attempts',axis=1)
df3['color']=np.nan
Bonus
df3.loc[[True,False,False,False,False,False,False,False,False,False,False]]
name | score | qualify | color | |
---|---|---|---|---|
0 | Anastasia | 12.5 | True | NaN |
Iterating through DataFrame
for key,value in df.iteritems():
print('key',key)
print('value',value)
key name value 0 Mukul 1 Rohan 2 Shivam 3 Ragav 4 Monu Name: name, dtype: object key class value 0 12 1 12 2 11 3 11 4 10 Name: class, dtype: int64 key total marks value 0 454.0 1 433.0 2 NaN 3 NaN 4 456.0 Name: total marks, dtype: float64
for key,value in df.iterrows():
print('key',key)
print('value',value)
key 0 value name Mukul class 12 total marks 454 Name: 0, dtype: object key 1 value name Rohan class 12 total marks 433 Name: 1, dtype: object key 2 value name Shivam class 11 total marks NaN Name: 2, dtype: object key 3 value name Ragav class 11 total marks NaN Name: 3, dtype: object key 4 value name Monu class 10 total marks 456 Name: 4, dtype: object
for value in df.itertuples():
print('value',value)
value Pandas(Index=0, name='Mukul', _2=12, _3=454.0) value Pandas(Index=1, name='Rohan', _2=12, _3=433.0) value Pandas(Index=2, name='Shivam', _2=11, _3=nan) value Pandas(Index=3, name='Ragav', _2=11, _3=nan) value Pandas(Index=4, name='Monu', _2=10, _3=456.0)
import seaborn as sns
import matplotlib.pyplot as plt
sns.heatmap(df.corr(),cmap='GnBu',annot=np.array([['',''],list(df.corr()['total marks'])]),fmt = 's',linewidths=10,linecolor='r')
plt.show()
df.corr()
class | total marks | |
---|---|---|
class | 1.000000 | -0.566429 |
total marks | -0.566429 | 1.000000 |
df.corr()['total marks']
class -0.566429 total marks 1.000000 Name: total marks, dtype: float64
df.corr().loc['class',['total marks']]
total marks -0.566429 Name: class, dtype: float64