%matplotlib inline
import pandas as pd
# Column names: the 'name' field of every row of adult.names except the first,
# followed by the target column 'income'.
names_table = pd.read_csv('adult.names', names=['name', 'desc'], sep=':')
names = list(names_table.iloc[1:]['name']) + ['income']
# index_col=False keeps pandas from using the first data column as the index.
data = pd.read_csv('adult.data', names=names, index_col=False)
data.head()
| | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 34 | Private | 80933 | HS-grad | 9 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 1672 | 40 | United-States | <=50K |
1 | 22 | Private | 317019 | Some-college | 10 | Never-married | Other-service | Not-in-family | White | Female | 0 | 0 | 30 | United-States | <=50K |
2 | 42 | Private | 261929 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 7688 | 0 | 40 | United-States | >50K |
3 | 38 | Local-gov | 286405 | Doctorate | 16 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 1887 | 50 | United-States | >50K |
4 | 45 | State-gov | 252208 | HS-grad | 9 | Separated | Adm-clerical | Own-child | White | Female | 0 | 0 | 40 | United-States | <=50K |
# Horizontal bar chart of how many rows fall into each 'race' value.
race_counts = data['race'].value_counts()
race_counts.plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe98aac55f8>
# Mean of the 'age' column over the whole data set.
data.loc[:, 'age'].mean()
38.420666666666669
# Number of rows whose marital status is ' Never-married'
# (the key is looked up with its leading space).
marital_counts = data['marital-status'].value_counts()
marital_counts[' Never-married']
996
# Average weekly working hours of men, per workclass, as a horizontal bar chart.
# 'hours-per-week' is selected BEFORE aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises TypeError
# on pandas >= 2.0 (and wastes work on older versions).
(data[data['sex'] == ' Male']
 .groupby('workclass')['hours-per-week']
 .mean()
 .plot(kind='barh'))
<matplotlib.axes._subplots.AxesSubplot at 0x7fe98aa8f630>
# Average 'education-num' of women, per occupation, as a horizontal bar chart.
# 'education-num' is selected BEFORE aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises TypeError
# on pandas >= 2.0 (and wastes work on older versions).
(data[data['sex'] == ' Female']
 .groupby('occupation')['education-num']
 .mean()
 .plot(kind='barh'))
<matplotlib.axes._subplots.AxesSubplot at 0x7fe98aa8f898>
# Mean age within each income class, drawn as a horizontal bar chart.
mean_age_by_income = data.groupby('income')['age'].mean()
mean_age_by_income.plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe98a8c7da0>
# Fraction of people earning ' >50K' at every age.
sizes = data.groupby('age').size()
rich_sizes = data.loc[data['income'] == ' >50K'].groupby('age').size()

# The division aligns on age; ages that have no ' >50K' rows are missing from
# rich_sizes and come out as NaN, which is turned into a rate of 0.
rates = rich_sizes / sizes
rates = rates.fillna(0)
rates.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fe98a98f9b0>
# 5-point moving average of the per-age >50K rate, to smooth the curve.
# Series.rolling(...).mean() replaces pd.rolling_mean, which has been removed
# from pandas; the defaults (trailing window, NaN until the window is full)
# are the same.
rates.rolling(window=5).mean().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fe98a808240>
import matplotlib.pylab as plt
# Age distribution inside each income group, as a percentage of that group.
plt.style.use('ggplot')
df = data
dfp = df.loc[df['income'] == ' <=50K']
dfr = df.loc[df['income'] == ' >50K']

# Head count per age divided by the group size, scaled to percent.
poorer_pct = dfp.groupby('age').size() / len(dfp) * 100
richer_pct = dfr.groupby('age').size() / len(dfr) * 100
poorer_pct.plot(alpha=0.7)
richer_pct.plot(alpha=0.7)

plt.xlabel('Age')
plt.ylabel('Percentage')
plt.title('Distribution of ages in income groups')
# Curve labels are placed by hand at fixed data coordinates.
# NOTE(review): the label colours are hard-coded; confirm they match the line
# colours produced by the active style.
plt.text(55, 2.2, 'Richer', color='blue', alpha=0.7)
plt.text(25, 3.8, 'Poorer', color='red', alpha=0.7)
<matplotlib.text.Text at 0x7fe98a8eb390>
# Stacked area chart of the number of people per age in both income groups.
#
# Each group's head count per age is placed on a common 0..90 age axis so that
# the row label of `d` equals the age; ages with nobody in a group get a count
# of 0. The groupby is evaluated once per group instead of once per age.
# NOTE(review): ages above 90 would be left out by the reindex — the data is
# assumed to stay within 17..90, as the original loops did.
age_axis = range(91)
d = pd.DataFrame({
    'Poorer': dfp.groupby('age').size().reindex(age_axis, fill_value=0),
    'Richer': dfr.groupby('age').size().reindex(age_axis, fill_value=0),
})

# 3-point moving average to smooth the counts. DataFrame.rolling(...).mean()
# replaces pd.rolling_mean, which has been removed from pandas.
d.rolling(window=3).mean().plot(kind='area', stacked=True, alpha=0.7)
plt.xlabel('Age')
plt.ylabel('Number of people')
plt.title('Shares of ≤50K and >50K by age')
<matplotlib.text.Text at 0x7fe98a7360f0>