# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze, settings
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku
settings.MODELS_BASE_PATH='../models'
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook(bokeh.resources.INLINE)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
irisDf = pd.read_csv('./data/problem_dataset.csv')
irisDf.describe()
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile RuntimeWarning)
OCCURRENCE_ID | Y | X | GAUL_AD0 | infection_time | |
---|---|---|---|---|---|
count | 42066.000000 | 42066.000000 | 42066.000000 | 4.206600e+04 | 42093.000000 |
mean | 21033.500000 | 13.124483 | 63.094441 | 3.986041e+04 | 5.469627 |
std | 12143.552549 | 17.566448 | 82.068577 | 1.953122e+05 | 2.869416 |
min | 1.000000 | -38.950000 | -179.980000 | 3.000000e+00 | 1.000000 |
25% | NaN | NaN | NaN | NaN | 3.000000 |
50% | NaN | NaN | NaN | NaN | 5.000000 |
75% | NaN | NaN | NaN | NaN | 8.000000 |
max | 42066.000000 | 52.320000 | 179.860000 | 1.013965e+06 | 10.000000 |
irisDf.head()
Unnamed: 0 | VECTOR | OCCURRENCE_ID | SOURCE_TYPE | LOCATION_TYPE | POLYGON_ADMIN | Y | X | YEAR | COUNTRY | COUNTRY_ID | GAUL_AD0 | STATUS | infection_source | infection_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Aedes aegypti | 1.0 | published | point | -999 | -3.22 | 40.07 | 1958 | Kenya | KEN | 133.0 | NaN | air | 9 |
1 | 1 | Aedes aegypti | 2.0 | published | point | -999 | -4.27 | 15.30 | 1960 | Congo | COG | 59.0 | NaN | mosquito | 5 |
2 | 2 | Aedes aegypti | 3.0 | published | point | -999 | -4.27 | 15.30 | 1960 | Congo | COG | 59.0 | NaN | mosquito | 7 |
3 | 3 | Aedes aegypti | 4.0 | published | point | -999 | -3.22 | 40.07 | 1960 | Kenya | KEN | 133.0 | NaN | water | 1 |
4 | 4 | Aedes aegypti | 5.0 | published | point | -999 | -3.04 | 40.14 | 1960 | Kenya | KEN | 133.0 | NaN | water | 2 |
irisDf['POLYGON_ADMIN'].unique()
array(['-999', '2', nan, 'Less than 100km', 'Less than 25km', 'Less than 10km'], dtype=object)
irisDf['STATUS'].fillna('NA', inplace=True)
irisDf['YEAR'].fillna('NA', inplace=True)
irisDf['SOURCE_TYPE'].fillna('NA', inplace=True)
irisDf['POLYGON_ADMIN'].fillna('NA', inplace=True)
irisDf.var()
OCCURRENCE_ID 1.474659e+08 Y 3.085801e+02 X 6.735251e+03 GAUL_AD0 3.814687e+10 infection_time 8.233546e+00 dtype: float64
irisDf.skew()
OCCURRENCE_ID -1.161637e-17 Y -1.079129e+00 X -8.496533e-01 GAUL_AD0 4.771877e+00 infection_time 4.528951e-03 dtype: float64
irisDf.corr()
OCCURRENCE_ID | Y | X | GAUL_AD0 | infection_time | |
---|---|---|---|---|---|
OCCURRENCE_ID | 1.000000 | 0.432515 | 0.396861 | -0.206791 | 0.004040 |
Y | 0.432515 | 1.000000 | 0.566422 | -0.093467 | 0.001087 |
X | 0.396861 | 0.566422 | 1.000000 | -0.052476 | 0.004926 |
GAUL_AD0 | -0.206791 | -0.093467 | -0.052476 | 1.000000 | 0.001704 |
infection_time | 0.004040 | 0.001087 | 0.004926 | 0.001704 | 1.000000 |
irisDf.select_dtypes(include=[np.number]).columns
Index(['OCCURRENCE_ID', 'Y', 'X', 'GAUL_AD0', 'infection_time'], dtype='object')
analyze.correlation_analyze(irisDf, exclude_columns='Id',
categories=['VECTOR', 'SOURCE_TYPE', 'LOCATION_TYPE', 'YEAR' ,
'COUNTRY','POLYGON_ADMIN', 'infection_source'], )
# Correlation btw Numerical Columns
# Correlation btw Columns VECTOR & SOURCE_TYPE by count # Correlation btw Columns VECTOR & LOCATION_TYPE by count # Correlation btw Columns VECTOR & YEAR by count # Correlation btw Columns VECTOR & COUNTRY by count # Correlation btw Columns VECTOR & POLYGON_ADMIN by count # Correlation btw Columns VECTOR & infection_source by count # Correlation btw Columns SOURCE_TYPE & LOCATION_TYPE by count # Correlation btw Columns SOURCE_TYPE & YEAR by count # Correlation btw Columns SOURCE_TYPE & COUNTRY by count # Correlation btw Columns SOURCE_TYPE & POLYGON_ADMIN by count # Correlation btw Columns SOURCE_TYPE & infection_source by count # Correlation btw Columns LOCATION_TYPE & YEAR by count # Correlation btw Columns LOCATION_TYPE & COUNTRY by count # Correlation btw Columns LOCATION_TYPE & POLYGON_ADMIN by count # Correlation btw Columns LOCATION_TYPE & infection_source by count # Correlation btw Columns YEAR & COUNTRY by count # Correlation btw Columns YEAR & POLYGON_ADMIN by count # Correlation btw Columns YEAR & infection_source by count # Correlation btw Columns COUNTRY & POLYGON_ADMIN by count # Correlation btw Columns COUNTRY & infection_source by count # Correlation btw Columns POLYGON_ADMIN & infection_source by count
# Pandas correlation coefficients matrix OCCURRENCE_ID Y X GAUL_AD0 infection_time OCCURRENCE_ID 1.000000 0.432515 0.396861 -0.206791 0.004040 Y 0.432515 1.000000 0.566422 -0.093467 0.001087 X 0.396861 0.566422 1.000000 -0.052476 0.004926 GAUL_AD0 -0.206791 -0.093467 -0.052476 1.000000 0.001704 infection_time 0.004040 0.001087 0.004926 0.001704 1.000000 # Pandas co-variance coefficients matrix OCCURRENCE_ID Y X GAUL_AD0 \ OCCURRENCE_ID 1.474659e+08 92263.603007 395512.897408 -4.904631e+08 Y 9.226360e+04 308.580109 816.584268 -3.206784e+05 X 3.955129e+05 816.584268 6735.251340 -8.411426e+05 GAUL_AD0 -4.904631e+08 -320678.359260 -841142.552660 3.814687e+10 infection_time 1.407915e+02 0.054800 1.159944 9.552904e+02 infection_time OCCURRENCE_ID 140.791513 Y 0.054800 X 1.159944 GAUL_AD0 955.290437 infection_time 8.233546
irisDf.head()
Unnamed: 0 | VECTOR | OCCURRENCE_ID | SOURCE_TYPE | LOCATION_TYPE | POLYGON_ADMIN | Y | X | YEAR | COUNTRY | COUNTRY_ID | GAUL_AD0 | STATUS | infection_source | infection_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Aedes aegypti | 1.0 | published | point | -999 | -3.22 | 40.07 | 1958 | Kenya | KEN | 133.0 | NA | air | 9 |
1 | 1 | Aedes aegypti | 2.0 | published | point | -999 | -4.27 | 15.30 | 1960 | Congo | COG | 59.0 | NA | mosquito | 5 |
2 | 2 | Aedes aegypti | 3.0 | published | point | -999 | -4.27 | 15.30 | 1960 | Congo | COG | 59.0 | NA | mosquito | 7 |
3 | 3 | Aedes aegypti | 4.0 | published | point | -999 | -3.22 | 40.07 | 1960 | Kenya | KEN | 133.0 | NA | water | 1 |
4 | 4 | Aedes aegypti | 5.0 | published | point | -999 | -3.04 | 40.14 | 1960 | Kenya | KEN | 133.0 | NA | water | 2 |
irisDf.groupby('infection_time').sum()
OCCURRENCE_ID | Y | X | GAUL_AD0 | |
---|---|---|---|---|
infection_time | ||||
1 | 90284470.0 | 56248.357739 | 269714.084304 | 172275162.0 |
2 | 88925877.0 | 55458.632437 | 264169.444308 | 171198874.0 |
3 | 89778629.0 | 56400.023396 | 278216.816323 | 162154520.0 |
4 | 88724893.0 | 54035.926489 | 257929.282360 | 149922566.0 |
5 | 86282326.0 | 55170.807230 | 261861.550212 | 185343399.0 |
6 | 88788955.0 | 55741.108921 | 264776.159121 | 155047903.0 |
7 | 87821753.0 | 54141.033120 | 255267.970528 | 184313228.0 |
8 | 89510917.0 | 57096.985882 | 267621.070332 | 161105159.0 |
9 | 88652217.0 | 54498.301807 | 269906.598588 | 179351188.0 |
10 | 86025174.0 | 53303.329921 | 264667.780173 | 156056123.0 |
analyze.dist_analyze(irisDf)
Variance of OCCURRENCE_ID 147465868.5 Skewness of OCCURRENCE_ID 0.0 Variance of Y 308.580109152 Skewness of Y -1.0791294673 Variance of X 6735.2513405 Skewness of X -0.849653266576 Variance of GAUL_AD0 38146865433.4 Skewness of GAUL_AD0 4.77187706336 Variance of infection_time 8.23354599061 Skewness of infection_time 0.00452895082148 Too many categorise for col: YEAR can't plot pie-chart Too many categorise for col: Unnamed: 0 can't plot pie-chart Too many categorise for col: COUNTRY_ID can't plot pie-chart Too many categorise for col: COUNTRY can't plot pie-chart Too many categorise for col: LOCATION_TYPE can't plot pie-chart
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
irisDf['YEAR'] = pd.to_numeric(irisDf['YEAR'], errors='coerce')
analyze.dist_analyze(irisDf, 'YEAR')
Variance of YEAR 58.012606326 Skewness of YEAR -2.48720150025
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
analyze.dist_analyze(irisDf, 'GAUL_AD0')
Variance of GAUL_AD0 38146865433.4 Skewness of GAUL_AD0 4.77187706336
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
def impute_values(df):
# Forcing that one record that uses a year range to NaN.probably should just drop the record.
df['YEAR'] = pd.to_numeric(df['YEAR'], errors='coerce')
# Picking randomly from one of top 3 modes
df['YEAR'].fillna(random.choice([2004, 2005,2013]), inplace=True)
# introduction new category for nan vals(other two are almost equally distributed)
df['VECTOR'].fillna('NA', inplace=True)
# Imputing SOURCE_TYPE with most popular entry.. almost 5x unpublished entries
df['SOURCE_TYPE'].fillna('unpublished', inplace=True)
# X and Y both are atleast bi-modal distributions with some outliers. So filling NA with mode values
df['X'].fillna(df['X'].mode(), inplace=True)
df['Y'].fillna(df['Y'].mode(), inplace=True)
#Imputing location type with most popular entry.
df['LOCATION_TYPE'].fillna('point', inplace=True)
# introduction new category for nan vals(other three are almost equally distributed)
df['infection_source'].fillna('NA', inplace=True)
# imputing polygon admin with most popular -999
df['POLYGON_ADMIN'].fillna('-999', inplace=True)
# And as it would be there's one COUNTRY without a COUNTRY_ID.. most likely a spelling but can
# circle back to fixing it when it's time to improve model performance(also can eliminate one of
# country/country_id)
df['COUNTRY'].fillna('Taiwan', inplace=True)
df['COUNTRY_ID'].fillna('TWN', inplace=True)
# STATUS is dominated by E values
df['STATUS'].fillna('E', inplace=True)
# infection_time is almost uniformly distributed from 1 to 10
df['infection_time'].fillna(random.choice([1+x for x in range(10)]), inplace=True)
# GAUL_AD0 very skewed picking up mode
df['GAUL_AD0'].fillna(df['GAUL_AD0'].mode(), inplace=True)
impute_values(irisDf)
new_df = pd.DataFrame()
new_df['year'] = irisDf.YEAR.unique()
new_df['occurrences'] = irisDf.groupby('YEAR').count().reset_index()['OCCURRENCE_ID']
analyze.correlation_analyze(new_df)
# Correlation btw Numerical Columns
# Pandas correlation coefficients matrix year occurrences year 1.000000 0.582886 occurrences 0.582886 1.000000 # Pandas co-variance coefficients matrix year occurrences year 267.017857 1.539969e+04 occurrences 15399.694156 2.614062e+06
analyze.silhouette_analyze(new_df)
For clusters = 2 The average silhouette_score is : 0.853372251152
For clusters = 4 The average silhouette_score is : 0.862393864548
For clusters = 6 The average silhouette_score is : 0.870465287541
new_df['year']
0 1958.0 1 1960.0 2 1961.0 3 1962.0 4 1963.0 5 1964.0 6 1965.0 7 1966.0 8 1967.0 9 1968.0 10 1969.0 11 1972.0 12 1970.0 13 1971.0 14 1973.0 15 1974.0 16 1975.0 17 1976.0 18 1977.0 19 1978.0 20 1979.0 21 1980.0 22 1981.0 23 1982.0 24 1983.0 25 1984.0 26 1985.0 27 1986.0 28 1988.0 29 1987.0 30 1989.0 31 1990.0 32 1991.0 33 1992.0 34 1995.0 35 1993.0 36 1994.0 37 1998.0 38 1996.0 39 1997.0 40 1999.0 41 2000.0 42 2001.0 43 2002.0 44 2003.0 45 2004.0 46 2005.0 47 2006.0 48 2007.0 49 2008.0 50 2009.0 51 2011.0 52 2010.0 53 2012.0 54 2013.0 55 2014.0 Name: year, dtype: float64
dec_df = pd.DataFrame()
decades = ['%d - %ds'%(x-10, x) for x in range(1960,2030, 10)]
end_years = [x for x in range(1960,2030,10)]
occurrences=[]
i_counts = [sum(new_df[(new_df.year < x) & (new_df.year >=x-10)].occurrences) for x in range(1960,2030,10)]
dec_df['decades'] = decades
dec_df['infections'] = i_counts
dec_df['end_years'] = end_years
dec_df['log_infections'] = dec_df['infections'].apply(lambda x: np.log(x))
dec_df
decades | infections | end_years | log_infections | |
---|---|---|---|---|
0 | 1950 - 1960s | 1 | 1960 | 0.000000 |
1 | 1960 - 1970s | 293 | 1970 | 5.680173 |
2 | 1970 - 1980s | 477 | 1980 | 6.167516 |
3 | 1980 - 1990s | 684 | 1990 | 6.527958 |
4 | 1990 - 2000s | 2455 | 2000 | 7.805882 |
5 | 2000 - 2010s | 20345 | 2010 | 9.920590 |
6 | 2010 - 2020s | 17811 | 2020 | 9.787572 |
from scipy.optimize import curve_fit
def func(x, m, b):
return m*x + b
popt, pcov = curve_fit(func, dec_df['end_years'], dec_df['log_infections'])
print(popt)
[ 1.41006842e-01 -2.74047946e+02]
def predictor(end_year):
return np.exp2(popt[0]*end_year + popt[1])
print(predictor(2030))
print(predictor(2040))
4691.85864328 12468.6033719
irs
grouped_by_cnt_year = irisDf.groupby(['YEAR','COUNTRY']).count().reset_index()
drop_cols =['VECTOR', 'OCCURRENCE_ID','SOURCE_TYPE','LOCATION_TYPE', 'POLYGON_ADMIN','Y','X','COUNTRY_ID',
'GAUL_AD0','STATUS','infection_source', 'infection_time']
grouped_by_cnt_year.drop(drop_cols,1, inplace=True)
analyze.dist_analyze(grouped_by_cnt_year)
Variance of YEAR 195.973836144 Skewness of YEAR -0.746475346502 Variance of Unnamed: 0 116266.443204 Skewness of Unnamed: 0 12.6221236242 Too many categorise for col: COUNTRY can't plot pie-chart
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
grouped_by_cnt = irisDf.groupby('COUNTRY').count().reset_index()
drop_cols =['VECTOR', 'OCCURRENCE_ID','SOURCE_TYPE','YEAR', 'LOCATION_TYPE', 'POLYGON_ADMIN','Y','X','COUNTRY_ID',
'GAUL_AD0','STATUS','infection_source', 'infection_time']
grouped_by_cnt.drop(drop_cols,1, inplace=True)
analyze.dist_analyze(grouped_by_cnt)
Variance of Unnamed: 0 4566452.56945 Skewness of Unnamed: 0 10.6756330165 Too many categorise for col: COUNTRY can't plot pie-chart
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
grouped_by_cnt['Unnamed: 0'].rename('country_cnts', inplace=True)
0 1 1 22 2 1 3 12 4 1 5 17 6 6 7 173 8 2 9 298 10 1 11 6 12 21 13 1 14 1 15 2 16 3 17 1 18 2 19 33 20 1 21 8501 22 2 23 5 24 1 25 17 26 41 27 35 28 97 29 1 30 21 31 13 32 142 33 131 34 8 35 9 36 2 37 18 38 8 39 180 40 1 41 2 42 4 43 6 44 7 45 5 46 27 47 37 48 48 49 70 50 56 51 16 52 2 53 3 54 12 55 2 56 18 57 14 58 6 59 1 60 25 61 17 62 16 63 17 64 8 65 3 66 583 67 768 68 20 69 209 70 11 71 109 72 53 73 3 74 8 75 29 76 16 77 1 78 1 79 90 80 303 81 2 82 1 83 3 84 44 85 13 86 41 87 465 88 2 89 2 90 1 91 24 92 2 93 6 94 7 95 26 96 1 97 14 98 47 99 1 100 2 101 32 102 40 103 13 104 19 105 89 106 60 107 124 108 52 109 1 110 1 111 2 112 1 113 6 114 12 115 8 116 112 117 1 118 5 119 21 120 90 121 1 122 4 123 3 124 4 125 21 126 11 127 43 128 6 129 3 130 1 131 24864 132 584 133 3 134 3 135 5 136 159 137 4 138 3 139 17 140 46 141 2044 142 8 143 5 144 143 145 243 146 1 147 1 148 3 149 1 150 2 Name: country_cnts, dtype: int64
filtered_countries = grouped_by_cnt[grouped_by_cnt['Unnamed: 0']>8.0]['COUNTRY']
print(list(filtered_countries))
sorted(grouped_by_cnt_year['COUNTRY'].unique())
['Albania', 'American Samoa', 'Anguilla', 'Argentina', 'Australia', 'Bangladesh', 'Bolivia', 'Brazil', 'Burkina Faso', "C?te d'Ivoire", 'Cambodia', 'Cameroon', 'Cayman Islands', 'Central African Republic', 'China', 'Colombia', 'Congo', 'Costa Rica', 'Cuba', 'Fiji', 'France', 'French Guiana', 'French Polynesia', 'Gabon', 'Gambia', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guyana', 'Haiti', 'India', 'Indonesia', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Kenya', "Lao People's Democratic Republic", 'Lebanon', 'Madagascar', 'Malaysia', 'Martinique', 'Mauritius', 'Mayotte', 'Mexico', 'Myanmar', 'New Caledonia', 'Nicaragua', 'Nigeria', 'Pakistan', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Puerto Rico', 'R?union', 'Samoa', 'Senegal', 'Sierra Leone', 'Singapore', 'South Africa', 'Spain', 'Sri Lanka', 'Taiwan', 'Thailand', 'Trinidad and Tobago', 'Uganda', 'United Republic of Tanzania', 'United States of America', 'Venezuela', 'Viet Nam']
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Aruba', 'Australia', 'Austria', 'Bahamas', 'Bangladesh', 'Barbados', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', "C?te d'Ivoire", 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands', 'Central African Republic', 'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Czech Republic', 'Democratic Republic of the Congo', 'Djibouti', 'Dominica', 'Dominican Republic', 'El Salvador', 'Fiji', 'France', 'French Guiana', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Glorioso Island', 'Greece', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guinea', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong', 'India', 'Indonesia', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Kenya', 'Kiribati', 'Korea, Republic of', "Lao People's Democratic Republic", 'Lebanon', 'Liberia', 'Macau', 'Madagascar', 'Malaysia', 'Maldives', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritius', 'Mayotte', 'Mexico', 'Montenegro', 'Montserrat', 'Mozambique', 'Myanmar', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Nigeria', 'Niue', 'Northern Mariana Islands', 'Pakistan', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Puerto Rico', 'R?union', 'Romania', 'Russian Federation', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines', 'Samoa', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Switzerland', 'Taiwan', 'Thailand', 'Timor-Leste', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Turkey', 'Tuvalu', 'Uganda', 'United Republic of Tanzania', 'United States of America', 'Uruguay', 'Vanuatu', 'Venezuela', 'Viet Nam', 'Wallis and Futuna', 'West Bank', 'Yemen', 'Zambia', 'Zimbabwe']
temp_df = pd.DataFrame(columns=['YEAR', 'COUNTRY', 'COUNT'])
for each in list(filtered_countries):
temp_df = temp_df.append(grouped_by_cnt_year[grouped_by_cnt_year.COUNTRY==each])
#new_rows = list()
#for row in grouped_by_cnt_year.iterrows():
# print(list(row))
# if row[1][0] in list(filtered_countries):
# new_rows.append(row)
#
temp_df.head()
COUNT | COUNTRY | Unnamed: 0 | YEAR | |
---|---|---|---|---|
157 | NaN | Albania | 14.0 | 1979.0 |
332 | NaN | Albania | 1.0 | 1992.0 |
403 | NaN | Albania | 1.0 | 1996.0 |
529 | NaN | Albania | 1.0 | 2001.0 |
669 | NaN | Albania | 5.0 | 2006.0 |
temp_df.rename(columns={'Unnamed: 0': 'count'},inplace=True)
temp_df.drop('COUNT', 1, inplace=True)
yearly_sum_country = temp_df.groupby(['COUNTRY', 'YEAR']).sum().reset_index()
country_avg_rate = dict()
for each in filtered_countries:
country_avg_rate[each] = abs(yearly_sum_country[yearly_sum_country.COUNTRY == each]['YEAR'].min() - \
yearly_sum_country[yearly_sum_country.COUNTRY == each]['YEAR'].max())/\
len(yearly_sum_country[yearly_sum_country.COUNTRY==each]['YEAR'])
import pprint
import operator
sorted_data = sorted(country_avg_rate.items(), key=operator.itemgetter(1),reverse=True)
pprint.pprint(sorted_data)
[('Guyana', 25.0), ('Congo', 17.666666666666668), ('Gambia', 17.0), ('Jamaica', 16.666666666666668), ('Mauritius', 14.333333333333334), ('Haiti', 13.333333333333334), ('Papua New Guinea', 12.333333333333334), ('Ghana', 10.6), ('Anguilla', 10.333333333333334), ('Cambodia', 9.5999999999999996), ('United Republic of Tanzania', 9.0), ('American Samoa', 9.0), ('Israel', 7.5999999999999996), ('Uganda', 7.5714285714285712), ('Pakistan', 7.4285714285714288), ('Central African Republic', 6.5), ('South Africa', 6.4000000000000004), ('Burkina Faso', 6.2857142857142856), ('Cameroon', 6.2857142857142856), ('Grenada', 5.666666666666667), ('Albania', 5.4000000000000004), ('Panama', 4.875), ('Guatemala', 4.5), ('Bangladesh', 4.4285714285714288), ("C?te d'Ivoire", 4.0), ('Myanmar', 3.8999999999999999), ('Fiji', 3.8999999999999999), ('Nigeria', 3.7857142857142856), ('New Caledonia', 3.7272727272727271), ('Martinique', 3.7142857142857144), ('Philippines', 3.3846153846153846), ('Kenya', 3.2352941176470589), ('French Polynesia', 3.2000000000000002), ('Gabon', 3.2000000000000002), ('Nicaragua', 3.1666666666666665), ('Singapore', 3.0666666666666669), ('R?union', 3.0), ('Venezuela', 2.9444444444444446), ('Colombia', 2.7894736842105261), ('Costa Rica', 2.6666666666666665), ('Japan', 2.6470588235294117), ('French Guiana', 2.6363636363636362), ('Madagascar', 2.6000000000000001), ('Senegal', 2.5555555555555554), ('Sri Lanka', 2.5384615384615383), ('Viet Nam', 2.4736842105263159), ('Bolivia', 2.2727272727272729), ('Lebanon', 2.25), ('Spain', 2.25), ('France', 2.1666666666666665), ('Samoa', 2.0), ('Puerto Rico', 1.8148148148148149), ('Australia', 1.7777777777777777), ('Paraguay', 1.75), ('Taiwan', 1.6842105263157894), ('China', 1.6399999999999999), ('Malaysia', 1.5357142857142858), ('Peru', 1.5333333333333334), ('Guinea', 1.5), ('Thailand', 1.4411764705882353), ('Cuba', 1.3913043478260869), ('Cayman Islands', 1.375), ('Indonesia', 1.2777777777777777), ('Mexico', 1.2692307692307692), ('Mayotte', 1.25), ('Trinidad and Tobago', 1.2222222222222223), ('India', 1.1951219512195121), ('United States of America', 1.173913043478261), ('Italy', 1.1499999999999999), ('Brazil', 1.1290322580645162), ("Lao People's Democratic Republic", 1.0), ('Greece', 1.0), ('Argentina', 1.0), ('Sierra Leone', 0.0)]
yearly_sum_country.head()
COUNTRY | YEAR | count | |
---|---|---|---|
0 | Albania | 1979.0 | 14.0 |
1 | Albania | 1992.0 | 1.0 |
2 | Albania | 1996.0 | 1.0 |
3 | Albania | 2001.0 | 1.0 |
4 | Albania | 2006.0 | 5.0 |
top_countries = [k for k,v in sorted_data[:5]]
print(top_countries)
temp_df = pd.DataFrame()
for each in top_countries:
temp_df = temp_df.append(grouped_by_cnt_year[grouped_by_cnt_year.COUNTRY==each])
plotter.show(plotter.lineplot(temp_df[temp_df.COUNTRY==each], 'YEAR', 'Unnamed: 0', title=each))
['Guyana', 'Congo', 'Gambia', 'Jamaica', 'Mauritius']