In [1]:
import pandas as pd #importing packages
import os as os
In [2]:
#pd.describe_option() #describe options for customizing
In [3]:
#pd.get_option("display.memory_usage")#setting some options
In [4]:
os.getcwd() #current working directory
Out[4]:
'/home/ajay'
In [5]:
os.chdir('/home/ajay/Desktop')
In [6]:
os.getcwd()
Out[6]:
'/home/ajay/Desktop'
In [7]:
a=os.getcwd()
os.listdir(a)
Out[7]:
['adult.data']
In [8]:
names2=["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","income"]
In [9]:
len(names2)
Out[9]:
15
In [10]:
adult=pd.read_csv("adult.data",header=None)
In [11]:
len(adult)
Out[11]:
32562
In [12]:
adult.columns
Out[12]:
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')
In [13]:
adult.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 32562 entries, 0 to 32561
Data columns (total 15 columns):
0     32561 non-null float64
1     32561 non-null object
2     32561 non-null float64
3     32561 non-null object
4     32561 non-null float64
5     32561 non-null object
6     32561 non-null object
7     32561 non-null object
8     32561 non-null object
9     32561 non-null object
10    32561 non-null float64
11    32561 non-null float64
12    32561 non-null float64
13    32561 non-null object
14    32561 non-null object
dtypes: float64(6), object(9)
In [14]:
adult.head(8)
Out[14]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K
7 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse Exec-managerial Husband White Male 0 0 45 United-States >50K
In [15]:
adult.columns= names2
In [16]:
adult.head(30)
Out[16]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K
7 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse Exec-managerial Husband White Male 0 0 45 United-States >50K
8 31 Private 45781 Masters 14 Never-married Prof-specialty Not-in-family White Female 14084 0 50 United-States >50K
9 42 Private 159449 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 5178 0 40 United-States >50K
10 37 Private 280464 Some-college 10 Married-civ-spouse Exec-managerial Husband Black Male 0 0 80 United-States >50K
11 30 State-gov 141297 Bachelors 13 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 India >50K
12 23 Private 122272 Bachelors 13 Never-married Adm-clerical Own-child White Female 0 0 30 United-States <=50K
13 32 Private 205019 Assoc-acdm 12 Never-married Sales Not-in-family Black Male 0 0 50 United-States <=50K
14 40 Private 121772 Assoc-voc 11 Married-civ-spouse Craft-repair Husband Asian-Pac-Islander Male 0 0 40 ? >50K
15 34 Private 245487 7th-8th 4 Married-civ-spouse Transport-moving Husband Amer-Indian-Eskimo Male 0 0 45 Mexico <=50K
16 25 Self-emp-not-inc 176756 HS-grad 9 Never-married Farming-fishing Own-child White Male 0 0 35 United-States <=50K
17 32 Private 186824 HS-grad 9 Never-married Machine-op-inspct Unmarried White Male 0 0 40 United-States <=50K
18 38 Private 28887 11th 7 Married-civ-spouse Sales Husband White Male 0 0 50 United-States <=50K
19 43 Self-emp-not-inc 292175 Masters 14 Divorced Exec-managerial Unmarried White Female 0 0 45 United-States >50K
20 40 Private 193524 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K
21 54 Private 302146 HS-grad 9 Separated Other-service Unmarried Black Female 0 0 20 United-States <=50K
22 35 Federal-gov 76845 9th 5 Married-civ-spouse Farming-fishing Husband Black Male 0 0 40 United-States <=50K
23 43 Private 117037 11th 7 Married-civ-spouse Transport-moving Husband White Male 0 2042 40 United-States <=50K
24 59 Private 109015 HS-grad 9 Divorced Tech-support Unmarried White Female 0 0 40 United-States <=50K
25 56 Local-gov 216851 Bachelors 13 Married-civ-spouse Tech-support Husband White Male 0 0 40 United-States >50K
26 19 Private 168294 HS-grad 9 Never-married Craft-repair Own-child White Male 0 0 40 United-States <=50K
27 54 ? 180211 Some-college 10 Married-civ-spouse ? Husband Asian-Pac-Islander Male 0 0 60 South >50K
28 39 Private 367260 HS-grad 9 Divorced Exec-managerial Not-in-family White Male 0 0 80 United-States <=50K
29 49 Private 193366 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States <=50K
In [17]:
adult.describe() #numerical summaries
Out[17]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
count 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000
mean 38.581647 189778.366512 10.080679 1077.648844 87.303830 40.437456
std 13.640433 105549.977697 2.572720 7385.292085 402.960219 12.347429
min 17.000000 12285.000000 1.000000 0.000000 0.000000 1.000000
25% 28.000000 117827.000000 9.000000 0.000000 0.000000 40.000000
50% 37.000000 178356.000000 10.000000 0.000000 0.000000 40.000000
75% 48.000000 237051.000000 12.000000 0.000000 0.000000 45.000000
max 90.000000 1484705.000000 16.000000 99999.000000 4356.000000 99.000000
In [18]:
workclass=adult.groupby("workclass")
In [19]:
len(workclass)
Out[19]:
9
In [20]:
workclass.sum()
Out[20]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
workclass
? 75203 346115997 17002 1114077 111556 58604
Federal-gov 40887 177812394 10535 799903 107778 39724
Local-gov 87385 394822919 23111 1842264 229925 85777
Never-worked 144 1581927 52 0 0 199
Private 835158 4374974348 224230 20181687 1815878 913902
Self-emp-inc 51355 196395180 12429 5441274 173135 54481
Self-emp-not-inc 114268 446221558 25985 4792483 296361 112876
State-gov 51188 239009324 14766 910806 108067 50663
Without-pay 669 2439745 127 6830 0 458
In [21]:
workclass.count()
Out[21]:
age fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
workclass
? 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836
Federal-gov 960 960 960 960 960 960 960 960 960 960 960 960 960 960
Local-gov 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093
Never-worked 7 7 7 7 7 7 7 7 7 7 7 7 7 7
Private 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696
Self-emp-inc 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116
Self-emp-not-inc 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541
State-gov 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298
Without-pay 14 14 14 14 14 14 14 14 14 14 14 14 14 14
In [22]:
workclass.describe()
Out[22]:
age capital-gain capital-loss education-num fnlwgt hours-per-week
workclass
? count 1836.000000 1836.000000 1836.000000 1836.000000 1836.000000 1836.000000
mean 40.960240 606.795752 60.760349 9.260349 188516.338235 31.919390
std 20.334587 5147.323872 354.685264 2.601986 107089.902252 14.909903
min 17.000000 0.000000 0.000000 1.000000 12285.000000 1.000000
25% 21.000000 0.000000 0.000000 9.000000 117771.250000 20.000000
50% 35.000000 0.000000 0.000000 9.000000 175617.000000 36.000000
75% 61.000000 0.000000 0.000000 10.000000 234568.500000 40.000000
max 90.000000 99999.000000 4356.000000 16.000000 981628.000000 99.000000
Federal-gov count 960.000000 960.000000 960.000000 960.000000 960.000000 960.000000
mean 42.590625 833.232292 112.268750 10.973958 185221.243750 41.379167
std 11.509171 4101.966767 453.504623 2.113650 117502.359524 8.838605
min 17.000000 0.000000 0.000000 3.000000 19914.000000 4.000000
25% 34.000000 0.000000 0.000000 9.000000 97781.250000 40.000000
50% 43.000000 0.000000 0.000000 10.000000 175771.000000 40.000000
75% 51.000000 0.000000 0.000000 13.000000 243960.250000 40.000000
max 90.000000 99999.000000 3683.000000 16.000000 930948.000000 99.000000
Local-gov count 2093.000000 2093.000000 2093.000000 2093.000000 2093.000000 2093.000000
mean 41.751075 880.202580 109.854276 11.042045 188639.712852 40.982800
std 12.272856 5775.043442 439.513203 2.552536 100254.775314 10.771559
min 17.000000 0.000000 0.000000 1.000000 14878.000000 2.000000
25% 32.000000 0.000000 0.000000 9.000000 121124.000000 40.000000
50% 41.000000 0.000000 0.000000 11.000000 179580.000000 40.000000
75% 50.000000 0.000000 0.000000 13.000000 236487.000000 44.000000
max 90.000000 99999.000000 2444.000000 16.000000 1125613.000000 99.000000
Never-worked count 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000
mean 20.571429 0.000000 0.000000 7.428571 225989.571429 28.428571
std 4.613644 0.000000 0.000000 2.299068 108135.748347 15.186147
min 17.000000 0.000000 0.000000 4.000000 153663.000000 4.000000
25% 18.000000 0.000000 0.000000 6.000000 166902.000000 20.000000
50% 18.000000 0.000000 0.000000 7.000000 188535.000000 35.000000
... ... ... ... ... ... ... ...
Self-emp-inc std 12.553194 17976.548086 549.488497 2.603210 96436.282913 13.900417
min 17.000000 0.000000 0.000000 2.000000 21626.000000 1.000000
25% 37.000000 0.000000 0.000000 9.000000 113539.750000 40.000000
50% 45.000000 0.000000 0.000000 10.000000 165667.000000 50.000000
75% 54.000000 0.000000 0.000000 13.000000 213722.750000 60.000000
max 84.000000 99999.000000 2559.000000 16.000000 1097453.000000 99.000000
Self-emp-not-inc count 2541.000000 2541.000000 2541.000000 2541.000000 2541.000000 2541.000000
mean 44.969697 1886.061787 116.631641 10.226289 175608.641480 44.421881
std 13.338162 10986.233506 467.611687 2.768132 100735.757730 16.674958
min 17.000000 0.000000 0.000000 2.000000 20098.000000 1.000000
25% 35.000000 0.000000 0.000000 9.000000 104973.000000 40.000000
50% 44.000000 0.000000 0.000000 10.000000 168109.000000 40.000000
75% 54.000000 0.000000 0.000000 13.000000 227298.000000 50.000000
max 90.000000 99999.000000 2824.000000 16.000000 795830.000000 99.000000
State-gov count 1298.000000 1298.000000 1298.000000 1298.000000 1298.000000 1298.000000
mean 39.436055 701.699538 83.256549 11.375963 184136.613251 39.031587
std 12.431065 3777.749185 394.469789 2.538604 111512.980926 11.697014
min 17.000000 0.000000 0.000000 1.000000 19395.000000 1.000000
25% 30.000000 0.000000 0.000000 9.000000 108903.750000 38.000000
50% 39.000000 0.000000 0.000000 10.000000 169402.500000 40.000000
75% 48.000000 0.000000 0.000000 13.000000 238532.750000 40.000000
max 81.000000 99999.000000 3683.000000 16.000000 1033222.000000 99.000000
Without-pay count 14.000000 14.000000 14.000000 14.000000 14.000000 14.000000
mean 47.785714 487.857143 0.000000 9.071429 174267.500000 32.714286
std 21.075610 1300.780467 0.000000 1.685426 85536.385921 17.357900
min 19.000000 0.000000 0.000000 4.000000 27012.000000 10.000000
25% 23.750000 0.000000 0.000000 9.000000 138446.500000 20.000000
50% 57.000000 0.000000 0.000000 9.000000 171531.500000 27.500000
75% 65.000000 0.000000 0.000000 9.750000 209006.500000 47.500000
max 72.000000 4416.000000 0.000000 12.000000 344858.000000 65.000000

72 rows × 6 columns

In [23]:
race=adult.groupby("race")
In [24]:
race.sum()
Out[24]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
race
Amer-Indian-Eskimo 11561 37578487 2896 194458 10629 12455
Asian-Pac-Islander 39219 166178293 11388 1536014 101014 41692
Black 117987 712313000 29635 1905454 188643 120033
Other 9067 53420656 2396 253293 16550 10696
White 1078423 5209882956 281922 31200105 2525864 1131808
In [25]:
race.mean()
Out[25]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
race
Amer-Indian-Eskimo 37.173633 120831.147910 9.311897 625.266881 34.176849 40.048232
Asian-Pac-Islander 37.746872 159940.609240 10.960539 1478.358037 97.222329 40.127045
Black 37.767926 228013.124200 9.486236 609.940461 60.385083 38.422855
Other 33.457565 197124.191882 8.841328 934.660517 61.070111 39.468635
White 38.769881 187298.064280 10.135246 1121.660375 90.806155 40.689100
In [26]:
pd.crosstab(adult.race, adult.workclass)
Out[26]:
workclass ? Federal-gov Local-gov Never-worked Private Self-emp-inc Self-emp-not-inc State-gov Without-pay
race
Amer-Indian-Eskimo 25 19 36 0 190 2 24 15 0
Asian-Pac-Islander 65 44 39 0 713 46 73 58 1
Black 213 169 288 2 2176 23 93 159 1
Other 23 7 10 0 213 5 9 4 0
White 1510 721 1720 5 19404 1040 2342 1062 12
In [27]:
pd.crosstab(adult.race, adult.sex)
Out[27]:
sex Female Male
race
Amer-Indian-Eskimo 119 192
Asian-Pac-Islander 346 693
Black 1555 1569
Other 109 162
White 8642 19174
In [28]:
pd.crosstab(adult.income, adult.sex)
Out[28]:
sex Female Male
income
<=50K 9592 15128
>50K 1179 6662
In [29]:
pd.crosstab(adult.income, adult.race)
Out[29]:
race Amer-Indian-Eskimo Asian-Pac-Islander Black Other White
income
<=50K 275 763 2737 246 20699
>50K 36 276 387 25 7117
In [35]:
adult.corr(method='pearson', min_periods=1)
Out[35]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
age 1.000000 -0.076646 0.036527 0.077674 0.057775 0.068756
fnlwgt -0.076646 1.000000 -0.043195 0.000432 -0.010252 -0.018768
education-num 0.036527 -0.043195 1.000000 0.122630 0.079923 0.148123
capital-gain 0.077674 0.000432 0.122630 1.000000 -0.031615 0.078409
capital-loss 0.057775 -0.010252 0.079923 -0.031615 1.000000 0.054256
hours-per-week 0.068756 -0.018768 0.148123 0.078409 0.054256 1.000000
In [ ]: