import pandas as pd # pandas will be referred to as pd below
import numpy
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
elec_use = pd.read_csv('electricity_use_per_person.csv')
median_age = pd.read_csv('median_age_years.csv')
elec_use = elec_use[['country', '1975','1980','1985','1990','1995','2000','2005','2010']]
elec_use = elec_use.dropna()
median_age = median_age[['country','1975','1980' ,'1985','1990','1995','2000','2005','2010']]
melt1 = pd.melt(elec_use, id_vars=["country"], value_vars=['1975','1980' ,'1985','1990','1995','2000','2005','2010'], value_name='Elec_usage')
melt2 = pd.melt(median_age, id_vars=["country"], value_vars=['1975','1980' ,'1985','1990','1995','2000','2005','2010'], value_name='Median_Age')
#melt2.set_index('country',inplace=True)
#melt2
#melt2[melt2["country"] == 'Turkey']
melt1[melt1["country"] == 'Turkey']
 | country | variable | Elec_usage
---|---|---|---
99 | Turkey | 1975 | 359.0 |
208 | Turkey | 1980 | 496.0 |
317 | Turkey | 1985 | 660.0 |
426 | Turkey | 1990 | 930.0 |
535 | Turkey | 1995 | 1230.0 |
644 | Turkey | 2000 | 1650.0 |
753 | Turkey | 2005 | 2010.0 |
862 | Turkey | 2010 | 2490.0 |
melt2[melt2["country"] == 'Turkey']
 | country | variable | Median_Age
---|---|---|---
169 | Turkey | 1975 | 19.6 |
353 | Turkey | 1980 | 20.0 |
537 | Turkey | 1985 | 21.0 |
721 | Turkey | 1990 | 22.1 |
905 | Turkey | 1995 | 23.5 |
1089 | Turkey | 2000 | 24.9 |
1273 | Turkey | 2005 | 26.6 |
1457 | Turkey | 2010 | 28.3 |
#melt1.set_index('country',inplace=True)
#melt2.set_index('country',inplace=True)
merged= pd.merge(melt1, melt2,how='inner', left_on=['country','variable'], right_on=['country','variable'])
merged[merged["country"] == 'Turkey']
 | country | variable | Elec_usage | Median_Age
---|---|---|---|---
99 | Turkey | 1975 | 359.0 | 19.6 |
208 | Turkey | 1980 | 496.0 | 20.0 |
317 | Turkey | 1985 | 660.0 | 21.0 |
426 | Turkey | 1990 | 930.0 | 22.1 |
535 | Turkey | 1995 | 1230.0 | 23.5 |
644 | Turkey | 2000 | 1650.0 | 24.9 |
753 | Turkey | 2005 | 2010.0 | 26.6 |
862 | Turkey | 2010 | 2490.0 | 28.3 |
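Since each country should appear only once per year in each melted frame, it can be worth confirming that the join keys are unique before trusting the merge. A minimal sketch using the standard validate argument of pd.merge (an optional check, not part of the original analysis):

# hedged sketch: re-run the inner join with a one-to-one key check
merged_check = pd.merge(melt1, melt2, how='inner',
                        on=['country', 'variable'],
                        validate='one_to_one')  # raises MergeError if either side has duplicate (country, variable) keys
print(len(melt1), len(melt2), len(merged_check))  # rows before and after the join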
data = merged.rename(columns={'variable': 'Year', 'country': 'Country'})
data
 | Country | Year | Elec_usage | Median_Age
---|---|---|---|---
0 | Albania | 1975 | 739.0 | 20.2 |
1 | Algeria | 1975 | 195.0 | 16.5 |
2 | Angola | 1975 | 127.0 | 16.6 |
3 | Argentina | 1975 | 1000.0 | 27.4 |
4 | Australia | 1975 | 4780.0 | 28.1 |
5 | Austria | 1975 | 3730.0 | 33.9 |
6 | Bahrain | 1975 | 2630.0 | 19.3 |
7 | Bangladesh | 1975 | 17.0 | 17.6 |
8 | Belgium | 1975 | 3890.0 | 34.3 |
9 | Benin | 1975 | 18.1 | 18.3 |
10 | Bolivia | 1975 | 189.0 | 18.6 |
11 | Brazil | 1975 | 649.0 | 19.5 |
12 | Brunei | 1975 | 1480.0 | 19.6 |
13 | Bulgaria | 1975 | 3040.0 | 33.7 |
14 | Cameroon | 1975 | 175.0 | 18.4 |
15 | Canada | 1975 | 10400.0 | 27.5 |
16 | Chile | 1975 | 742.0 | 21.5 |
17 | China | 1975 | 199.0 | 20.4 |
18 | Colombia | 1975 | 442.0 | 17.8 |
19 | Congo, Dem. Rep. | 1975 | 167.0 | 17.8 |
20 | Congo, Rep. | 1975 | 64.1 | 17.9 |
21 | Costa Rica | 1975 | 730.0 | 19.3 |
22 | Cote d'Ivoire | 1975 | 122.0 | 17.6 |
23 | Cuba | 1975 | 623.0 | 22.3 |
24 | Cyprus | 1975 | 1030.0 | 27.4 |
25 | Czech Republic | 1975 | 4030.0 | 32.7 |
26 | Denmark | 1975 | 3410.0 | 33.0 |
27 | Dominican Republic | 1975 | 402.0 | 17.0 |
28 | Ecuador | 1975 | 196.0 | 18.1 |
29 | Egypt | 1975 | 236.0 | 19.2 |
... | ... | ... | ... | ... |
842 | Portugal | 2010 | 4960.0 | 41.6 |
843 | Qatar | 2010 | 14800.0 | 31.8 |
844 | Romania | 2010 | 2550.0 | 39.4 |
845 | Saudi Arabia | 2010 | 7970.0 | 25.9 |
846 | Senegal | 2010 | 199.0 | 18.1 |
847 | Singapore | 2010 | 8680.0 | 37.3 |
848 | Slovak Republic | 2010 | 5200.0 | 37.3 |
849 | South Africa | 2010 | 4510.0 | 24.8 |
850 | South Korea | 2010 | 9720.0 | 38.0 |
851 | Spain | 2010 | 5710.0 | 40.6 |
852 | Sri Lanka | 2010 | 461.0 | 30.4 |
853 | Sudan | 2010 | 131.0 | 18.3 |
854 | Sweden | 2010 | 14900.0 | 40.7 |
855 | Switzerland | 2010 | 8170.0 | 41.6 |
856 | Syria | 2010 | 1850.0 | 21.5 |
857 | Tanzania | 2010 | 93.0 | 17.2 |
858 | Thailand | 2010 | 2310.0 | 35.5 |
859 | Togo | 2010 | 123.0 | 18.6 |
860 | Trinidad and Tobago | 2010 | 6190.0 | 31.9 |
861 | Tunisia | 2010 | 1360.0 | 29.2 |
862 | Turkey | 2010 | 2490.0 | 28.3 |
863 | United Arab Emirates | 2010 | 11000.0 | 31.9 |
864 | United Kingdom | 2010 | 5700.0 | 39.6 |
865 | United States | 2010 | 13400.0 | 36.9 |
866 | Uruguay | 2010 | 2800.0 | 33.7 |
867 | Venezuela | 2010 | 3130.0 | 25.8 |
868 | Vietnam | 2010 | 1020.0 | 28.5 |
869 | Yemen | 2010 | 250.0 | 18.0 |
870 | Zambia | 2010 | 580.0 | 16.5 |
871 | Zimbabwe | 2010 | 547.0 | 18.6 |
872 rows × 4 columns
data[data["Country"] == 'Turkey']
 | Country | Year | Elec_usage | Median_Age | ElecGroup | AgeGroup | AGE
---|---|---|---|---|---|---|---
99 | Turkey | 1975 | 359.0 | 19.6 | 2.0 | (18, 25] | 0 |
208 | Turkey | 1980 | 496.0 | 20.0 | 2.0 | (18, 25] | 0 |
317 | Turkey | 1985 | 660.0 | 21.0 | 2.0 | (18, 25] | 0 |
426 | Turkey | 1990 | 930.0 | 22.1 | 2.0 | (18, 25] | 0 |
535 | Turkey | 1995 | 1230.0 | 23.5 | 3.0 | (18, 25] | 0 |
644 | Turkey | 2000 | 1650.0 | 24.9 | 3.0 | (18, 25] | 1 |
753 | Turkey | 2005 | 2010.0 | 26.6 | 3.0 | (25, 32] | 1 |
862 | Turkey | 2010 | 2490.0 | 28.3 | 3.0 | (25, 32] | 1 |
print('Elec. use per person ranges from ' + str(data['Elec_usage'].min()) + ' to ' + str(data['Elec_usage'].max()))
data['Elec_usage'].describe()
Elec. use per person ranges from 6.68 to 51400.0
count      872.000000
mean      3270.140115
std       4805.859468
min          6.680000
25%        314.000000
50%       1175.000000
75%       4567.500000
max      51400.000000
Name: Elec_usage, dtype: float64
# The Elec_usage column holds a continuous quantitative variable.
# To create electricity-use groups, we need to partition the values into bins.
# Below, we call pd.cut. Note that the first bin is (7, 300].
data['ElecGroup']= pd.cut(data.Elec_usage, [7,300,1000,5000,10000,20000,55000])
# no need to sort the data frame.
print(len(data))
print(len(data['ElecGroup']))
872 872
data['ElecGroup']= data['ElecGroup'].astype('category')
data['ElecGroup'].describe()
count              871
unique               6
top       (1000, 5000]
freq               273
Name: ElecGroup, dtype: object
ElecGroup= data['ElecGroup'].value_counts(sort= False, normalize= True)
# value_counts() should make more sense now
print(ElecGroup)
(7, 300]          0.246843
(300, 1000]       0.219288
(1000, 5000]      0.313433
(5000, 10000]     0.138921
(10000, 20000]    0.068886
(20000, 55000]    0.012629
Name: ElecGroup, dtype: float64
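Note that describe() above reports a count of 871 rather than 872: the minimum Elec_usage value (6.68) lies below the first bin edge of 7, so pd.cut leaves that row's ElecGroup as NaN. A minimal sketch to surface that row (an extra check, assuming the data frame built above):

# hedged sketch: include NaN in the counts and locate the row that fell outside the bins
print(data['ElecGroup'].value_counts(sort=False, dropna=False))
print(data[data['ElecGroup'].isna()][['Country', 'Year', 'Elec_usage']])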
seaborn.countplot(x= 'ElecGroup', data= data)
plt.xlabel('ElecGroup (per person) categories')
plt.title('Counts of rows for each ElecGroup')
plt.show()
seaborn.distplot(data['Elec_usage'].dropna(), kde= False)
plt.xlabel('Elec_usage (per person) quantitative values')
plt.title('Distribution of Elec_usage per person')
Text(0.5, 1.0, 'Distribution of Elec_usage per person')
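In seaborn 0.11 and later, distplot is deprecated in favour of histplot. A minimal sketch of the equivalent plot, assuming a recent seaborn version:

# hedged sketch: same histogram with the newer seaborn API
seaborn.histplot(data['Elec_usage'].dropna())
plt.xlabel('Elec_usage (per person) quantitative values')
plt.title('Distribution of Elec_usage per person')
plt.show()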
print('Median Age ranges from ' + str(data['Median_Age'].min()) + ' to ' + str(data['Median_Age'].max()))
data['Median_Age'].describe()
Median Age ranges from 14.3 to 44.7
count    872.000000
mean      25.344839
std        7.639786
min       14.300000
25%       18.500000
50%       23.250000
75%       31.800000
max       44.700000
Name: Median_Age, dtype: float64
data['AgeGroup']= pd.cut(data.Median_Age, [8,18,25,32,40,50])
data['AgeGroup']= data['AgeGroup'].astype('category')
data['AgeGroup'].describe()
count          872
unique           5
top       (18, 25]
freq           303
Name: AgeGroup, dtype: object
MG= data['AgeGroup'].value_counts(sort= False, normalize= True)
print(MG)
(8, 18]     0.213303
(18, 25]    0.347477
(25, 32]    0.194954
(32, 40]    0.213303
(40, 50]    0.030963
Name: AgeGroup, dtype: float64
seaborn.countplot(x= 'AgeGroup', data= data)
plt.xlabel('Median Age group')
plt.title('Counts of rows for each AgeGroup')
plt.show()
seaborn.regplot(x= "Median_Age", y= "Elec_usage", fit_reg= False, data= data)
<matplotlib.axes._subplots.AxesSubplot at 0x2645cf5f3c8>
# binary recode of Median_Age: 1 if the median age is 24 or above, else 0
def AGE(row):
if row['Median_Age'] < 24 :
return 0
else:
return 1
data['AGE'] = data.apply(AGE, axis=1)
# axis=1 tells pandas to apply the function to each row
# arbitrary functions can be applied along an axis of a DataFrame with the apply() method
data['AGE'].value_counts()
0    455
1    417
Name: AGE, dtype: int64
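The apply(..., axis=1) call works, but the same 0/1 recode can be expressed as a vectorized comparison, which is usually faster and shorter. A minimal sketch, equivalent to the AGE function above:

# hedged sketch: vectorized recode, 1 if Median_Age is 24 or above, else 0
data['AGE'] = (data['Median_Age'] >= 24).astype(int)
print(data['AGE'].value_counts())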
seaborn.factorplot(x= 'ElecGroup', y= 'AGE', data= data, kind= "bar", ci=None)
plt.xlabel('ElecGroup per person')
plt.ylabel('Proportion with median age >= 24')
plt.show()
# A categorical-to-categorical bar chart.
# Note that this bivariate graph displays a mean on the y-axis,
# so the categorical response variable should have only two levels, coded as 0 and 1.
# Important: with a 0/1 response, the bar heights above give the proportion of positive observations in each group.
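Because AGE is coded 0/1, each bar height in the plot above is simply the mean of AGE within an ElecGroup, i.e. the proportion of country-years whose median age is 24 or above. A minimal sketch to confirm the bar heights numerically:

# hedged sketch: the mean of a 0/1 variable is the proportion of 1s in each group
print(data.groupby('ElecGroup')['AGE'].mean())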
# Recode Elec_usage into the same six bins used above (edges: 7, 300, 1000, 5000, 10000, 20000, 55000),
# this time as integer codes 1-6.
def ElecRangeGroup(row):
    if 7.0 < row['Elec_usage'] <= 300.0:
        return 1
    elif 300.0 < row['Elec_usage'] <= 1000.0:
        return 2
    elif 1000.0 < row['Elec_usage'] <= 5000.0:
        return 3
    elif 5000.0 < row['Elec_usage'] <= 10000.0:
        return 4
    elif 10000.0 < row['Elec_usage'] <= 20000.0:
        return 5
    elif 20000.0 < row['Elec_usage'] <= 55000.0:
        return 6
data['ElecGroup'] = data.apply(ElecRangeGroup, axis=1)
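The same integer coding can be produced directly with pd.cut and its labels argument, which avoids the row-wise function above. A minimal sketch, assuming the same bin edges (an alternative, not the method used in this notebook):

# hedged sketch: integer-labelled bins equivalent to ElecRangeGroup
data['ElecGroup'] = pd.cut(data['Elec_usage'],
                           [7, 300, 1000, 5000, 10000, 20000, 55000],
                           labels=[1, 2, 3, 4, 5, 6]).astype(float)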
seaborn.factorplot(x= 'ElecGroup', y= 'AGE', data= data, kind= "bar", ci=None)
plt.xlabel('Elec. use per person category')
plt.ylabel('Median age over 24')
plt.show()
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
data3= data[['ElecGroup', 'Median_Age']]
print(data3)
     ElecGroup  Median_Age
0          2.0        20.2
1          1.0        16.5
2          1.0        16.6
3          2.0        27.4
4          3.0        28.1
5          3.0        33.9
6          3.0        19.3
7          1.0        17.6
8          3.0        34.3
9          1.0        18.3
10         1.0        18.6
11         2.0        19.5
12         3.0        19.6
13         3.0        33.7
14         1.0        18.4
15         5.0        27.5
16         2.0        21.5
17         1.0        20.4
18         2.0        17.8
19         1.0        17.8
20         1.0        17.9
21         2.0        19.3
22         1.0        17.6
23         2.0        22.3
24         3.0        27.4
25         3.0        32.7
26         3.0        33.0
27         2.0        17.0
28         1.0        18.1
29         1.0        19.2
..         ...         ...
842        3.0        41.6
843        5.0        31.8
844        3.0        39.4
845        4.0        25.9
846        1.0        18.1
847        4.0        37.3
848        4.0        37.3
849        3.0        24.8
850        4.0        38.0
851        4.0        40.6
852        2.0        30.4
853        1.0        18.3
854        5.0        40.7
855        4.0        41.6
856        3.0        21.5
857        1.0        17.2
858        3.0        35.5
859        1.0        18.6
860        4.0        31.9
861        3.0        29.2
862        3.0        28.3
863        5.0        31.9
864        4.0        39.6
865        5.0        36.9
866        3.0        33.7
867        3.0        25.8
868        3.0        28.5
869        1.0        18.0
870        2.0        16.5
871        2.0        18.6

[872 rows x 2 columns]
model1= smf.ols(formula='Median_Age ~ C(ElecGroup)', data=data3)
# statsmodels.formula.api as smf
# ols: ordinary least squares regression
# the response variable comes first, then the explanatory variable; C() marks ElecGroup as categorical
results= model1.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:             Median_Age   R-squared:                       0.636
Model:                            OLS   Adj. R-squared:                  0.634
Method:                 Least Squares   F-statistic:                     302.0
Date:                Mon, 01 Apr 2019   Prob (F-statistic):          6.38e-187
Time:                        01:14:26   Log-Likelihood:                -2566.7
No. Observations:                 871   AIC:                             5145.
Df Residuals:                     865   BIC:                             5174.
Df Model:                           5
Covariance Type:            nonrobust
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              18.1972      0.315     57.695      0.000      17.578      18.816
C(ElecGroup)[T.2.0]     2.5452      0.460      5.535      0.000       1.643       3.448
C(ElecGroup)[T.3.0]     9.7108      0.422     23.028      0.000       8.883      10.539
C(ElecGroup)[T.4.0]    16.4268      0.526     31.254      0.000      15.395      17.458
C(ElecGroup)[T.5.0]    15.5161      0.675     22.979      0.000      14.191      16.841
C(ElecGroup)[T.6.0]    15.9755      1.430     11.175      0.000      13.170      18.781
==============================================================================
Omnibus:                        2.739   Durbin-Watson:                   2.198
Prob(Omnibus):                  0.254   Jarque-Bera (JB):                2.859
Skew:                           0.047   Prob(JB):                        0.239
Kurtosis:                       3.264   Cond. No.                         10.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
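In this dummy-coded model the intercept is the mean Median_Age of the reference group (ElecGroup 1.0), and each C(ElecGroup)[T.x] coefficient is the difference between group x and that reference. A minimal sketch to check the estimates against the raw group means:

# hedged sketch: compare raw group means with intercept + coefficients
print(data3.groupby('ElecGroup')['Median_Age'].mean())
print(results.params)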
mc1 = multi.MultiComparison(data3['Median_Age'], data3['ElecGroup'])
res1 = mc1.tukeyhsd()
print(res1.summary())
Multiple Comparison of Means - Tukey HSD, FWER=0.05
==============================================
group1 group2 meandiff   lower    upper   reject
----------------------------------------------
 1.0    2.0     2.5452   1.1862   3.9042   True
 1.0    3.0     9.7108   8.4646  10.9571   True
 1.0    4.0    16.4268  14.8735    17.98   True
 1.0    5.0    15.5161  13.5206  17.5116   True
 1.0    6.0    15.9755  11.7506  20.2004   True
 1.0    nan     1.2028 -12.4962  14.9018  False
 2.0    3.0     7.1657   5.8764   8.4549   True
 2.0    4.0    13.8816  12.2936  15.4696   True
 2.0    5.0    12.9709  10.9483  14.9936   True
 2.0    6.0    13.4303   9.1925  17.6682   True
 2.0    nan    -1.3424 -15.0454  12.3606  False
 3.0    4.0     6.7159   5.2233   8.2085   True
 3.0    5.0     5.8053   3.8566    7.754   True
 3.0    6.0     6.2647   2.0616  10.4677   True
 3.0    nan    -8.5081 -22.2003   5.1842  False
 4.0    5.0    -0.9106  -3.0686   1.2474  False
 4.0    6.0    -0.4512  -4.7553   3.8528  False
 4.0    nan    -15.224 -28.9476  -1.5004   True
 5.0    6.0     0.4594  -4.0233   4.9421  False
 5.0    nan   -14.3133  -28.094  -0.5327   True
 6.0    nan   -14.7727 -29.0477  -0.4978   True
----------------------------------------------
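The Tukey table includes a 'nan' group because the one row with a missing ElecGroup (the Elec_usage value below the first bin edge) is carried into the comparison. A minimal sketch that drops missing rows first so only the six real groups are compared (a suggested refinement, not the original cell):

# hedged sketch: exclude rows with a missing group before the post-hoc test
data3_clean = data3.dropna(subset=['ElecGroup'])
mc_clean = multi.MultiComparison(data3_clean['Median_Age'], data3_clean['ElecGroup'])
print(mc_clean.tukeyhsd().summary())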
data4= data[['AGE', 'ElecGroup']]
print(data4)
     AGE  ElecGroup
0      0        2.0
1      0        1.0
2      0        1.0
3      1        2.0
4      1        3.0
5      1        3.0
6      0        3.0
7      0        1.0
8      1        3.0
9      0        1.0
10     0        1.0
11     0        2.0
12     0        3.0
13     1        3.0
14     0        1.0
15     1        5.0
16     0        2.0
17     0        1.0
18     0        2.0
19     0        1.0
20     0        1.0
21     0        2.0
22     0        1.0
23     0        2.0
24     1        3.0
25     1        3.0
26     1        3.0
27     0        2.0
28     0        1.0
29     0        1.0
..   ...        ...
842    1        3.0
843    1        5.0
844    1        3.0
845    1        4.0
846    0        1.0
847    1        4.0
848    1        4.0
849    1        3.0
850    1        4.0
851    1        4.0
852    1        2.0
853    0        1.0
854    1        5.0
855    1        4.0
856    0        3.0
857    0        1.0
858    1        3.0
859    0        1.0
860    1        4.0
861    1        3.0
862    1        3.0
863    1        5.0
864    1        4.0
865    1        5.0
866    1        3.0
867    1        3.0
868    1        3.0
869    0        1.0
870    0        2.0
871    0        2.0

[872 rows x 2 columns]
# contingency table of observed counts
ct1=pd.crosstab(data4['AGE'], data4['ElecGroup'])
print(ct1) # ct1 is a two-dimensional array (the contingency table)
ElecGroup  1.0  2.0  3.0  4.0  5.0  6.0
AGE
0          209  158   81    6    0    0
1            6   33  192  115   60   11
# column percentages
colsum = ct1.sum(axis=0) # axis=0 sums down each column to give column totals
colpct=ct1/colsum
print(colpct)
ElecGroup       1.0       2.0       3.0       4.0  5.0  6.0
AGE
0          0.972093  0.827225  0.296703  0.049587  0.0  0.0
1          0.027907  0.172775  0.703297  0.950413  1.0  1.0
import scipy.stats
# chi-square
print ('chi-square value, p value, expected counts')
cs1= scipy.stats.chi2_contingency(ct1)
print (cs1)
chi-square value, p value, expected counts
(487.1052465055917, 4.84579097311198e-103, 5, array([[112.06659013,  99.55683123, 142.29850746,  63.07003444,
         31.27439724,   5.73363949],
       [102.93340987,  91.44316877, 130.70149254,  57.92996556,
         28.72560276,   5.26636051]]))
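scipy.stats.chi2_contingency returns a tuple of (chi-square statistic, p value, degrees of freedom, expected counts), so unpacking it gives a more readable printout. A minimal sketch:

# hedged sketch: unpack the chi-square result into named values
chi2, p, dof, expected = scipy.stats.chi2_contingency(ct1)
print('chi-square:', chi2)
print('p value:', p)
print('degrees of freedom:', dof)
print('expected counts:')
print(expected)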
seaborn.factorplot(x= 'ElecGroup', y= 'AGE', data= data4, kind= "bar", ci=None)
plt.xlabel('Elec. usage per person category: 1 (lowest) to 6 (highest)')
plt.ylabel('Proportion with median age >= 24')
plt.show()
# note that the y values are equal to the column percentages (colpct) for AGE = 1
Here we have 6 groups, which gives 15 pairwise comparisons; with a Bonferroni adjustment the significance threshold becomes 0.05/15 ≈ 0.0033. A looped version of all 15 tests is sketched below, followed by the individual comparisons.
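Rather than repeating the recode / crosstab / chi-square block for every pair, the 15 comparisons can be run in a loop and checked against the Bonferroni threshold; note that the manual cells below cover 14 of the 15 pairs (the 1 vs 5 comparison is not shown). A minimal sketch, assuming data4 as defined above:

# hedged sketch: all pairwise chi-square tests with a Bonferroni-adjusted threshold
from itertools import combinations
groups = sorted(data4['ElecGroup'].dropna().unique())
pairs = list(combinations(groups, 2))   # 15 pairs for 6 groups
alpha_adj = 0.05 / len(pairs)           # 0.05 / 15 ~= 0.0033
for g1, g2 in pairs:
    subset = data4[data4['ElecGroup'].isin([g1, g2])]
    table = pd.crosstab(subset['AGE'], subset['ElecGroup'])
    chi2, p, dof, expected = scipy.stats.chi2_contingency(table)
    print(g1, 'vs', g2, ' chi2 =', round(chi2, 2), ' p =', p, ' reject H0:', p < alpha_adj)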
# We need to run chi-square tests for each of the comparisons
recode_1v2 = {1: 1, 2: 2} # map keeps only these two groups; all other ElecGroup values become NaN and are excluded from the crosstab
data4['Elec_1v2']= data4['ElecGroup'].map(recode_1v2)
# contingency table of observed counts
table_1v2=pd.crosstab(data4['AGE'], data4['Elec_1v2'])
print (table_1v2)
# column percentages
colsum=table_1v2.sum(axis=0)
colpct=table_1v2/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_1v2= scipy.stats.chi2_contingency(table_1v2)
print (cs_1v2)
Elec_1v2  1.0  2.0
AGE
0         209  158
1           6   33
Elec_1v2       1.0       2.0
AGE
0         0.972093  0.827225
1         0.027907  0.172775
chi-square value, p value, expected counts
(22.806297143283068, 1.7917765380037851e-06, 1, array([[194.34729064, 172.65270936],
       [ 20.65270936,  18.34729064]]))
recode_1v3 = {1: 1, 3: 3}
data4['Elec_1v3']= data4['ElecGroup'].map(recode_1v3)
# contingency table of observed counts
table_1v3=pd.crosstab(data4['AGE'], data4['Elec_1v3'])
print (table_1v3)
# column percentages
colsum=table_1v3.sum(axis=0)
colpct=table_1v3/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_1v3= scipy.stats.chi2_contingency(table_1v3)
print (cs_1v3)
Elec_1v3  1.0  3.0
AGE
0         209   81
1           6  192
Elec_1v3       1.0       3.0
AGE
0         0.972093  0.296703
1         0.027907  0.703297
chi-square value, p value, expected counts
(224.7521622551614, 8.315050688248716e-51, 1, array([[127.76639344, 162.23360656],
       [ 87.23360656, 110.76639344]]))
recode_1v4 = {1: 1, 4: 4}
data4['Elec_1v4']= data4['ElecGroup'].map(recode_1v4)
# contingency table of observed counts
table_1v4=pd.crosstab(data4['AGE'], data4['Elec_1v4'])
print (table_1v4)
# column percentages
colsum=table_1v4.sum(axis=0)
colpct=table_1v4/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_1v4= scipy.stats.chi2_contingency(table_1v4)
print (cs_1v4)
Elec_1v4  1.0  4.0
AGE
0         209    6
1           6  115
Elec_1v4       1.0       4.0
AGE
0         0.972093  0.049587
1         0.027907  0.950413
chi-square value, p value, expected counts
(281.95263018507967, 2.818963008671329e-63, 1, array([[137.57440476,  77.42559524],
       [ 77.42559524,  43.57440476]]))
recode_1v6 = {1: 1, 6: 6}
data4['Elec_1v6']= data4['ElecGroup'].map(recode_1v6)
# contingency table of observed counts
table_1v6=pd.crosstab(data4['AGE'], data4['Elec_1v6'])
print (table_1v6)
# column percentages
colsum=table_1v6.sum(axis=0)
colpct=table_1v6/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_1v6= scipy.stats.chi2_contingency(table_1v6)
print (cs_1v6)
Elec_1v6  1.0  6.0
AGE
0         209    0
1           6   11
Elec_1v6       1.0  6.0
AGE
0         0.972093  0.0
1         0.027907  1.0
chi-square value, p value, expected counts
(128.52345794787362, 8.622214054991668e-30, 1, array([[198.82743363,  10.17256637],
       [ 16.17256637,   0.82743363]]))
recode_2v3 = {2: 2, 3: 3}
data4['Elec_2v3']= data4['ElecGroup'].map(recode_2v3)
# contingency table of observed counts
table_2v3=pd.crosstab(data4['AGE'], data4['Elec_2v3'])
print (table_2v3)
# column percentages
colsum=table_2v3.sum(axis=0)
colpct=table_2v3/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_2v3= scipy.stats.chi2_contingency(table_2v3)
print (cs_2v3)
Elec_2v3  2.0  3.0
AGE
0         158   81
1          33  192
Elec_2v3       2.0       3.0
AGE
0         0.827225  0.296703
1         0.172775  0.703297
chi-square value, p value, expected counts
(124.51590444127238, 6.495642021769712e-29, 1, array([[ 98.38146552, 140.61853448],
       [ 92.61853448, 132.38146552]]))
recode_2v4 = {2: 2, 4: 4}
data4['Elec_2v4']= data4['ElecGroup'].map(recode_2v4)
# contingency table of observed counts
table_2v4=pd.crosstab(data4['AGE'], data4['Elec_2v4'])
print (table_2v4)
# column percentages
colsum=table_2v4.sum(axis=0)
colpct=table_2v4/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_2v4= scipy.stats.chi2_contingency(table_2v4)
print (cs_2v4)
Elec_2v4  2.0  4.0
AGE
0         158    6
1          33  115
Elec_2v4       2.0       4.0
AGE
0         0.827225  0.049587
1         0.172775  0.950413
chi-square value, p value, expected counts
(176.54308080172652, 2.755787423749102e-40, 1, array([[100.3974359,  63.6025641],
       [ 90.6025641,  57.3974359]]))
recode_2v5 = {2: 2, 5: 5}
data4['Elec_2v5']= data4['ElecGroup'].map(recode_2v5)
# contingency table of observed counts
table_2v5=pd.crosstab(data4['AGE'], data4['Elec_2v5'])
print (table_2v5)
# column percentages
colsum=table_2v5.sum(axis=0)
colpct=table_2v5/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_2v5= scipy.stats.chi2_contingency(table_2v5)
print (cs_2v5)
Elec_2v5  2.0  5.0
AGE
0         158    0
1          33   60
Elec_2v5       2.0  5.0
AGE
0         0.827225  0.0
1         0.172775  1.0
chi-square value, p value, expected counts
(130.43382402256765, 3.293304194760401e-30, 1, array([[120.2310757,  37.7689243],
       [ 70.7689243,  22.2310757]]))
recode_2v6 = {2: 2, 6: 6}
data4['Elec_2v6']= data4['ElecGroup'].map(recode_2v6)
# contingency table of observed counts
table_2v6=pd.crosstab(data4['AGE'], data4['Elec_2v6'])
print (table_2v6)
# column percentages
colsum=table_2v6.sum(axis=0)
colpct=table_2v6/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_2v6= scipy.stats.chi2_contingency(table_2v6)
print (cs_2v6)
Elec_2v6  2.0  6.0
AGE
0         158    0
1          33   11
Elec_2v6       2.0  6.0
AGE
0         0.827225  0.0
1         0.172775  1.0
chi-square value, p value, expected counts
(37.06063979068546, 1.1451202748956859e-09, 1, array([[149.3960396,   8.6039604],
       [ 41.6039604,   2.3960396]]))
recode_3v4 = {3: 3, 4: 4}
data4['Elec_3v4']= data4['ElecGroup'].map(recode_3v4)
# contingency table of observed counts
table_3v4=pd.crosstab(data4['AGE'], data4['Elec_3v4'])
print (table_3v4)
# column percentages
colsum=table_3v4.sum(axis=0)
colpct=table_3v4/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_3v4= scipy.stats.chi2_contingency(table_3v4)
print (cs_3v4)
Elec_3v4  3.0  4.0
AGE
0          81    6
1         192  115
Elec_3v4       3.0       4.0
AGE
0         0.296703  0.049587
1         0.703297  0.950413
chi-square value, p value, expected counts
(28.338129494062635, 1.0186955390669189e-07, 1, array([[ 60.28172589,  26.71827411],
       [212.71827411,  94.28172589]]))
recode_3v5 = {3: 3, 5: 5}
data4['Elec_3v5']= data4['ElecGroup'].map(recode_3v5)
# contingency table of observed counts
table_3v5=pd.crosstab(data4['AGE'], data4['Elec_3v5'])
print (table_3v5)
# column percentages
colsum=table_3v5.sum(axis=0)
colpct=table_3v5/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_3v5= scipy.stats.chi2_contingency(table_3v5)
print (cs_3v5)
Elec_3v5  3.0  5.0
AGE
0          81    0
1         192   60
Elec_3v5       3.0  5.0
AGE
0         0.296703  0.0
1         0.703297  1.0
chi-square value, p value, expected counts
(21.94009081196581, 2.8129558697502437e-06, 1, array([[ 66.40540541,  14.59459459],
       [206.59459459,  45.40540541]]))
recode_3v6 = {3: 3, 6: 6}
data4['Elec_3v6']= data4['ElecGroup'].map(recode_3v6)
# contingency table of observed counts
table_3v6=pd.crosstab(data4['AGE'], data4['Elec_3v6'])
print (table_3v6)
# column percentages
colsum=table_3v6.sum(axis=0)
colpct=table_3v6/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_3v6= scipy.stats.chi2_contingency(table_3v6)
print (cs_3v6)
Elec_3v6  3.0  6.0
AGE
0          81    0
1         192   11
Elec_3v6       3.0  6.0
AGE
0         0.296703  0.0
1         0.703297  1.0
chi-square value, p value, expected counts
(3.2266033951857707, 0.07245081307991667, 1, array([[ 77.86267606,   3.13732394],
       [195.13732394,   7.86267606]]))
recode_4v5 = {4: 4, 5: 5}
data4['Elec_4v5']= data4['ElecGroup'].map(recode_4v5)
# contingency table of observed counts
table_4v5=pd.crosstab(data4['AGE'], data4['Elec_4v5'])
print (table_4v5)
# column percentages
colsum=table_4v5.sum(axis=0)
colpct=table_4v5/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_4v5= scipy.stats.chi2_contingency(table_4v5)
print (cs_4v5)
Elec_4v5  4.0  5.0
AGE
0           6    0
1         115   60
Elec_4v5       4.0  5.0
AGE
0         0.049587  0.0
1         0.950413  1.0
chi-square value, p value, expected counts
(1.7245277777777779, 0.189111262313004, 1, array([[  4.01104972,   1.98895028],
       [116.98895028,  58.01104972]]))
recode_4v6 = {4: 4, 6: 6}
data4['Elec_4v6']= data4['ElecGroup'].map(recode_4v6)
# contingency table of observed counts
table_4v6=pd.crosstab(data4['AGE'], data4['Elec_4v6'])
print (table_4v6)
# column percentages
colsum=table_4v6.sum(axis=0)
colpct=table_4v6/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_4v6= scipy.stats.chi2_contingency(table_4v6)
print (cs_4v6)
Elec_4v6  4.0  6.0
AGE
0           6    0
1         115   11
Elec_4v6       4.0  6.0
AGE
0         0.049587  0.0
1         0.950413  1.0
chi-square value, p value, expected counts
(0.0, 1.0, 1, array([[  5.5,   0.5],
       [115.5,  10.5]]))
recode_5v6 = {5: 5, 6: 6}
data4['Elec_5v6']= data4['ElecGroup'].map(recode_5v6)
# contingency table of observed counts
table_5v6=pd.crosstab(data4['AGE'], data4['Elec_5v6'])
print (table_5v6)
# column percentages
colsum=table_5v6.sum(axis=0)
colpct=table_5v6/colsum
print(colpct)
print ('chi-square value, p value, expected counts')
cs_5v6= scipy.stats.chi2_contingency(table_5v6)
print (cs_5v6)
Elec_5v6  5.0  6.0
AGE
1          60   11
Elec_5v6  5.0  6.0
AGE
1           1    1
chi-square value, p value, expected counts
(0.0, 1.0, 0, array([[60., 11.]]))
seaborn.regplot(x= "Elec_usage", y= "Median_Age", fit_reg= False, data= data)
<matplotlib.axes._subplots.AxesSubplot at 0x2645ee96160>
print('association between Elec. usage per person and Median Age')
print(scipy.stats.pearsonr(data['Elec_usage'], data['Median_Age']))
association between Elec. usage per person and Median Age
(0.617638613515279, 7.588818646419762e-93)
As the scatter plot above shows, median age tends to increase as electricity use per person increases. The Pearson coefficient (r ≈ 0.62) indicates a moderately strong positive association, and the very small p-value suggests the coefficient is statistically significant.
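Elec_usage is strongly right-skewed (values range from about 7 to over 51,000 per person), so as an additional check a rank-based correlation, or a correlation on log-transformed electricity use, may describe the monotonic association better than the raw Pearson r. A minimal sketch (an extra check, not part of the original analysis):

# hedged sketch: rank-based correlation and correlation after a log transform
print(scipy.stats.spearmanr(data['Elec_usage'], data['Median_Age']))
print(scipy.stats.pearsonr(numpy.log(data['Elec_usage']), data['Median_Age']))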