import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
%matplotlib inline
# Print the installed pandas and matplotlib version strings.
print(f"Pandas version: {pd.__version__}, matplotlib version: {matplotlib.__version__}")
Pandas version: 1.3.2, matplotlib version: 3.4.3
# Sample dataset: one row per person, with demographic columns plus
# per-person counts of children and pets.
df = pd.DataFrame(
    data=dict(
        name=['john', 'mary', 'peter', 'jeff', 'bill', 'lisa', 'jose'],
        age=[23, 78, 22, 19, 45, 33, 20],
        gender=['M', 'F', 'M', 'M', 'M', 'F', 'M'],
        state=['california', 'dc', 'california', 'dc', 'california', 'texas', 'texas'],
        num_children=[2, 0, 0, 3, 2, 1, 4],
        num_pets=[5, 1, 0, 5, 2, 2, 3],
    )
)
# Display all six columns in their declared order.
df.loc[:, ['name', 'age', 'gender', 'state', 'num_children', 'num_pets']]
| | name | age | gender | state | num_children | num_pets |
---|---|---|---|---|---|---|
0 | john | 23 | M | california | 2 | 5 |
1 | mary | 78 | F | dc | 0 | 1 |
2 | peter | 22 | M | california | 0 | 0 |
3 | jeff | 19 | M | dc | 3 | 5 |
4 | bill | 45 | M | california | 2 | 2 |
5 | lisa | 33 | F | texas | 1 | 2 |
6 | jose | 20 | M | texas | 4 | 3 |
# Scatter plot of pets against children, one red marker per person.
df.plot.scatter(x='num_children', y='num_pets', color='red')
<AxesSubplot:xlabel='num_children', ylabel='num_pets'>
# Bar chart of each person's age, labelled by name on the x axis.
df.plot.bar(x='name', y='age')
<AxesSubplot:xlabel='name'>
# Clear the current figure, then draw both series onto the same axes.
plt.clf()
# plt.gca() means 'get current axes'; the first plot draws on those axes
# and returns them, so the second plot can reuse them via `ax`.
ax = df.plot.line(x='name', y='num_children', ax=plt.gca())
df.plot.line(x='name', y='num_pets', color='red', ax=ax)
plt.show()
# Bar chart of the number of rows in each state.
df.groupby('state').size().plot.bar()
<AxesSubplot:xlabel='state'>
# Count rows per (state, gender), pivot gender into columns, and draw one
# stacked bar per state.
(
    df.groupby(['state', 'gender'])
    .size()
    .unstack()
    .plot.bar(stacked=True)
)
plt.show()
# Stacked bar chart of row counts: one bar per gender, segmented by state.
# No plt.clf() here: pandas opens its own figure when no `ax` is passed, so
# clearing first only leaves an extra, empty figure behind. The figure size
# is set through `figsize` at creation time instead of resizing afterwards.
df.groupby(['gender','state']).size().unstack().plot(
    kind='bar', stacked=True, figsize=(7, 4)
)
plt.legend(loc='lower right')
plt.show()
<Figure size 432x288 with 0 Axes>
# Histogram of ages in 20-year bins; rwidth < 1 leaves a gap between bars.
df[['age']].plot.hist(bins=[0, 20, 40, 60, 80, 100], rwidth=0.9)
<AxesSubplot:ylabel='Frequency'>
import matplotlib.pyplot as plt

# Create a constant dummy column, then group by it so that every state
# ends up as a segment of one single stacked bar.
state_counts = (
    df.assign(dummy=1)
    .groupby(['dummy', 'state'])
    .size()
    .unstack()
)
# Disable the automatic legend because it is rebuilt below in reversed order.
state_counts.plot(kind='bar', stacked=True, legend=False)
plt.title('Number of records by State')
# otherwise the x label would show up as 'dummy'
plt.xlabel('state')
# disable ticks in the x axis
plt.xticks([])
# Rebuild the legend in reversed order. Labels are taken from the plotted
# columns (groupby sorts them) so they always match the handles;
# df['state'].unique() is in order of first appearance, which only matches
# the sorted order by coincidence.
current_handles, _ = plt.gca().get_legend_handles_labels()
labels = list(state_counts.columns)
plt.legend(current_handles[::-1], labels[::-1], loc='lower right')
plt.gcf().set_size_inches(7,4)
plt.show()
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Create a constant dummy column, then group by it so that every state
# ends up as a segment of one single stacked bar.
# transform() returns a result with the same (dummy, state) index as its
# input on every pandas version; apply() with a like-indexed result can add
# an extra group-key level on pandas >= 2.0, which would break unstack().
state_pct = (
    df.assign(dummy=1)
    .groupby(['dummy', 'state'])
    .size()
    .groupby(level=0)
    .transform(lambda x: 100 * x / x.sum())
    .unstack()
)
# Disable the automatic legend because it is rebuilt below in reversed order.
state_pct.plot(kind='bar', stacked=True, legend=False)
plt.title('Amount of records by State, normalized')
# otherwise the x label would show up as 'dummy'
plt.xlabel('state')
# disable ticks in the x axis
plt.xticks([])
# Rebuild the legend in reversed order. Labels are taken from the plotted
# columns (groupby sorts them) so they always match the handles;
# df['state'].unique() is in order of first appearance, which only matches
# the sorted order by coincidence.
current_handles, _ = plt.gca().get_legend_handles_labels()
labels = list(state_pct.columns)
plt.legend(current_handles[::-1], labels[::-1], loc='lower right')
plt.gcf().set_size_inches(7,4)
# Show the y axis as percentages (values are already scaled to 0-100).
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.show()
import matplotlib.ticker as mtick

# For each gender, the percentage of its rows that falls in each state,
# drawn as one stacked bar per gender.
# transform() returns a result with the same (gender, state) index as its
# input on every pandas version; apply() with a like-indexed result can add
# an extra group-key level on pandas >= 2.0, which would change what
# unstack() produces.
df.groupby(['gender','state']).size().groupby(level=0).transform(
    lambda x: 100 * x / x.sum()
).unstack().plot(kind='bar',stacked=True,legend='reverse')
# Show the y axis as percentages (values are already scaled to 0-100).
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.title('Amount of records by Gender and State, normalized')
# plt.legend(loc='lower right')
plt.gcf().set_size_inches(7,4)
plt.show()
# Second sample dataset: names with birth dates stored as MM/DD/YYYY strings.
# This rebinds `df`, replacing the earlier frame.
df = pd.DataFrame(
    data=dict(
        name=['john', 'lisa', 'peter', 'carl', 'linda', 'betty'],
        date_of_birth=[
            '01/21/1988',
            '03/10/1977',
            '07/25/1999',
            '01/22/1977',
            '09/30/1968',
            '09/15/1970',
        ],
    )
)
df
| | name | date_of_birth |
---|---|---|
0 | john | 01/21/1988 |
1 | lisa | 03/10/1977 |
2 | peter | 07/25/1999 |
3 | carl | 01/22/1977 |
4 | linda | 09/30/1968 |
5 | betty | 09/15/1970 |
# Parse the MM/DD/YYYY strings with an explicit format instead of letting
# pandas guess: `infer_datetime_format` is deprecated in pandas 2.x, and an
# explicit format removes any month/day ambiguity.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], format='%m/%d/%Y')
df.dtypes
name                     object
date_of_birth    datetime64[ns]
dtype: object
# Histogram of birth months, extracted with the vectorised `.dt` accessor
# rather than a per-element lambda.
# No plt.clf() here: pandas opens its own figure when no `ax` is passed, so
# clearing first only leaves an extra, empty figure behind.
df['date_of_birth'].dt.month.plot(kind='hist')
# plt.xlabel('Month number')
plt.show()
# Index the rows by birth date (keeping the column as well) and conform the
# index to month-end frequency; dates with no matching row appear as NaN/NaT.
df.set_index('date_of_birth', drop=False).asfreq('M')
date_of_birth | name | date_of_birth |
---|---|---|
1968-09-30 | linda | 1968-09-30 |
1968-10-31 | NaN | NaT |
1968-11-30 | NaN | NaT |
1968-12-31 | NaN | NaT |
1969-01-31 | NaN | NaT |
... | ... | ... |
1999-02-28 | NaN | NaT |
1999-03-31 | NaN | NaT |
1999-04-30 | NaN | NaT |
1999-05-31 | NaN | NaT |
1999-06-30 | NaN | NaT |
370 rows × 2 columns