%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import pandas as pd
import ipywidgets as widgets
Modify the code below such that income and consumption are variables in the dt DataFrame.
np.random.seed(1999)
N = 100
mydata = {}
mydata['id'] = range(N)
income = np.exp(np.random.normal(size=N))
consumption = np.sqrt(income)
dt = pd.DataFrame(mydata)
dt.head()
id | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
Answer:
np.random.seed(1999)
N = 100
mydata = {}
mydata['id'] = range(N)
mydata['income'] = np.exp(np.random.normal(size=N))
mydata['consumption'] = np.sqrt(mydata['income'])
dt_true = pd.DataFrame(mydata)
dt_true.head()
id | income | consumption | |
---|---|---|---|
0 | 0 | 0.727981 | 0.853218 |
1 | 1 | 1.997831 | 1.413447 |
2 | 2 | 0.276823 | 0.526140 |
3 | 3 | 1.481931 | 1.217346 |
4 | 4 | 1.235904 | 1.111712 |
Add a new variable ratio which is the ratio of consumption to income.
# write your code here
dt.head()
id | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
Answer:
dt_true['ratio'] = dt_true['consumption']/dt_true['income']
dt_true.head()
id | income | consumption | ratio | |
---|---|---|---|---|
0 | 0 | 0.727981 | 0.853218 | 1.172033 |
1 | 1 | 1.997831 | 1.413447 | 0.707490 |
2 | 2 | 0.276823 | 0.526140 | 1.900636 |
3 | 3 | 1.481931 | 1.217346 | 0.821459 |
4 | 4 | 1.235904 | 1.111712 | 0.899513 |
Produce summary statistics using .describe()
.
# write your code here
Answer:
dt_true.describe()
id | income | consumption | ratio | |
---|---|---|---|---|
count | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
mean | 49.500000 | 1.415547 | 1.087844 | 1.117517 |
std | 29.011492 | 1.322203 | 0.484238 | 0.525452 |
min | 0.000000 | 0.108402 | 0.329245 | 0.351134 |
25% | 24.750000 | 0.529323 | 0.727545 | 0.752310 |
50% | 49.500000 | 0.981178 | 0.990533 | 1.009580 |
75% | 74.250000 | 1.768211 | 1.329572 | 1.374491 |
max | 99.000000 | 8.110612 | 2.847914 | 3.037255 |
Select everybody with an income above 1.
# write your code here
dt.head()
id | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
Answer:
I = dt_true['income'] > 1
dt_true.loc[I, :].head()
id | income | consumption | ratio | |
---|---|---|---|---|
1 | 1 | 1.997831 | 1.413447 | 0.707490 |
3 | 3 | 1.481931 | 1.217346 | 0.821459 |
4 | 4 | 1.235904 | 1.111712 | 0.899513 |
6 | 6 | 2.574032 | 1.604379 | 0.623294 |
7 | 7 | 2.475478 | 1.573365 | 0.635580 |
Select everybody with an income above 1 and a ratio above 0.7.
# write your code here
Answer:
I = (dt_true['income'] > 1) & (dt_true['ratio'] > 0.7)
dt_true.loc[I].head()
id | income | consumption | ratio | |
---|---|---|---|---|
1 | 1 | 1.997831 | 1.413447 | 0.707490 |
3 | 3 | 1.481931 | 1.217346 | 0.821459 |
4 | 4 | 1.235904 | 1.111712 | 0.899513 |
11 | 11 | 2.031708 | 1.425380 | 0.701567 |
18 | 18 | 1.280235 | 1.131475 | 0.883802 |
Set consumption equal to 0.5 if income is less than 0.5.
# write your code here
# dt['consumption'].mean() # <- compare with answer
Answer:
I = (dt_true['income'] < 0.5)
dt_true.loc[I,['consumption']] = 0.5
dt_true['consumption'].mean()
1.075479712048503
Set consumption equal to income if income is less than 0.5.
# write your code here
# dt['consumption'].mean() # <- compare with answer
Answer:
I = (dt_true['income'] < 0.5)
dt_true.loc[I,['consumption']] = dt_true.loc[I,['income']].values
dt_true['consumption'].mean()
1.0337728690050052
Drop the ratio variable and all rows with an income above 1.5. After this, also drop the first 5 rows.
print(f'before: {dt.shape[0]} observations, {dt.shape[1]} variables')
# write your code here
print(f'after: {dt.shape[0]} observations, {dt.shape[1]} variables')
before: 100 observations, 1 variables after: 100 observations, 1 variables
Answer:
dt_alt = dt_true.copy()
print(f'before: {dt_true.shape[0]} observations, {dt_true.shape[1]} variables')
dt_true.drop('ratio',axis=1,inplace=True)
I = dt_true['income'] > 1.5
dt_true.drop(dt_true[I].index,inplace=True)
dt_true.drop(dt_true.loc[:5].index,inplace=True)
print(f'after: {dt_true.shape[0]} observations, {dt_true.shape[1]} variables')
# alternative: keep where I is false
del dt_alt['ratio']
I = dt_alt['income'] > 1.5
dt_alt = dt_alt[~I]
dt_alt = dt_alt.iloc[5:,:]
print(f'after (alt): {dt_alt.shape[0]} observations, {dt_alt.shape[1]} variables')
before: 100 observations, 4 variables after: 65 observations, 3 variables after (alt): 65 observations, 3 variables
Rename consumption to cons and income to inc.
# write your code
dt.head()
id | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
Answer:
dt_true.rename(columns={'income':'inc','consumption':'con'},inplace=True)
dt_true.head()
id | inc | con | |
---|---|---|---|
8 | 8 | 0.582074 | 0.762938 |
10 | 10 | 0.932044 | 0.965424 |
12 | 12 | 0.356952 | 0.356952 |
13 | 13 | 0.379825 | 0.379825 |
16 | 16 | 0.700896 | 0.837195 |
Correct the wrong lines such that assets_1 = assets_2 = assets_3 = assets_4
.
def assets_row_by_row(x,R,Y):
return 0 # wrong line
def assets_all_at_once(income,consumption,R,Y):
return 0 # wrong line
def assets_adj(assets,R,Y):
# missing lines
pass
R = 1.2 # return rate
Y = 1 # income
try:
dt['assets_1'] = R*(dt['inc']-dt['con'])+Y
dt['assets_2'] = dt.apply(assets_row_by_row,axis=1,args=(R,Y))
dt['assets_3'] = assets_all_at_once(dt['inc'].values(),dt['con'].values(),R,Y)
dt['assets_4'] = dt['inc']-dt['con']
assets_adj(dt['assets_4'],R,Y)
except:
print('failed')
dt.head()
failed
id | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
Answer:
def assets_row_by_row(x,R,Y):
return R*(x['inc']-x['con'])+Y
def assets_all_at_once(income,consumption,R,Y):
return R*(income-consumption)+Y
def assets_adj(assets,R,Y):
assets *= R
assets += Y
R = 1.2 # return rate
Y = 1 # income
dt_true['assets_1'] = R*(dt_true['inc']-dt_true['con'])+Y
dt_true['assets_2'] = dt_true.apply(assets_row_by_row,axis=1,args=(R,Y))
dt_true['assets_3'] = assets_all_at_once(dt_true['inc'].values,dt_true['con'].values,R,Y)
dt_true['assets_4'] = dt_true['inc']-dt_true['con']
assets_adj(dt_true['assets_4'],R,Y)
dt_true.head()
id | inc | con | assets_1 | assets_2 | assets_3 | assets_4 | |
---|---|---|---|---|---|---|---|
8 | 8 | 0.582074 | 0.762938 | 0.782963 | 0.782963 | 0.782963 | 0.782963 |
10 | 10 | 0.932044 | 0.965424 | 0.959943 | 0.959943 | 0.959943 | 0.959943 |
12 | 12 | 0.356952 | 0.356952 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
13 | 13 | 0.379825 | 0.379825 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
16 | 16 | 0.700896 | 0.837195 | 0.836441 | 0.836441 | 0.836441 | 0.836441 |
Load the data set in data/NAH1_pivoted.xlsx and clean and structure it such that the plot_timeseries(dataframe)
below can be run and produce an interactive figure.
def _plot_timeseries(dataframe, variable, years):
fig = plt.figure(dpi=100)
ax = fig.add_subplot(1,1,1)
dataframe.loc[:,['year']] = pd.to_numeric(dataframe['year'])
I = (dataframe['year'] >= years[0]) & (dataframe['year'] <= years[1])
x = dataframe.loc[I,'year']
y = dataframe.loc[I,variable]
ax.plot(x,y)
ax.set_xticks(list(range(years[0], years[1] + 1, 5)))
def plot_timeseries(dataframe):
widgets.interact(_plot_timeseries,
dataframe = widgets.fixed(dataframe),
variable = widgets.Dropdown(
description='variable',
options=['Y','C','G','I','X','M'],
value='Y'),
years=widgets.IntRangeSlider(
description="years",
min=1966,
max=2018,
value=[1980, 2018],
continuous_update=False,
)
);
Hint 1: You can base your renaming on this dictionary:
rename_dict = {}
rename_dict['P.1 Output'] = 'Y'
rename_dict['P.3 Final consumption expenditure'] = 'C'
rename_dict['P.3 Government consumption expenditure'] = 'G'
rename_dict['P.5 Gross capital formation'] = 'I'
rename_dict['P.6 Export of goods and services'] = 'X'
rename_dict['P.7 Import of goods and services'] = 'M'
Hint 2: You code should have the following structure:
# a. load data set
# nah1 = ?
# b. rename variables
# c. remove rows where Y is nan
# d. correct year column data
# hint, nah1.loc[J,['year']] = nah1.loc[I,['year']].values
# e. only keep rows with '2010-prices, chained values'
# f. only keep renamed variables
# g. interactive plot
# plot_timeseries(nan)
Answer:
# a. load data set
nah1 = pd.read_excel('data/NAH1_pivoted.xlsx',skiprows=2)
# b. rename variables
rename_dict['Unnamed: 0'] = 'year'
nah1.rename(columns=rename_dict,inplace=True)
# c. remove rows where Y is nan
I = nah1['Y'].notna()
nah1 = nah1[I]
# d. correct year column data
I = nah1['year'].notna()
J = nah1['year'].isna()
nah1.loc[J,['year']] = nah1.loc[I,['year']].values
# e. only keep rows with '2010-prices, chained values'
I = nah1['Unnamed: 1'] == '2010-prices, chained values'
nah1 = nah1[I]
# f. only keep renamed variables
nah1 = nah1.loc[:,['year','Y','C','G','I','X','M']]
nah1
year | Y | C | G | I | X | M | |
---|---|---|---|---|---|---|---|
1 | 1966 | 1089347.0 | 549491.0 | 171307.0 | 127631.0 | 123594.0 | 112171.0 |
3 | 1967 | 1124579.0 | 588094.0 | 184983.0 | 133749.0 | 128182.0 | 120028.0 |
5 | 1968 | 1162778.0 | 612327.0 | 195664.0 | 142242.0 | 140746.0 | 127670.0 |
7 | 1969 | 1239121.0 | 649337.0 | 209375.0 | 164447.0 | 149184.0 | 144021.0 |
9 | 1970 | 1282852.0 | 671805.0 | 222350.0 | 168129.0 | 154880.0 | 157151.0 |
11 | 1971 | 1315198.0 | 687221.0 | 235965.0 | 167979.0 | 164685.0 | 159437.0 |
13 | 1972 | 1375211.0 | 696409.0 | 246412.0 | 179858.0 | 173351.0 | 161273.0 |
15 | 1973 | 1433881.0 | 736943.0 | 250919.0 | 195814.0 | 187836.0 | 189976.0 |
17 | 1974 | 1420321.0 | 728730.0 | 255025.0 | 182647.0 | 194250.0 | 186142.0 |
19 | 1975 | 1384905.0 | 742587.0 | 252814.0 | 152977.0 | 192843.0 | 176903.0 |
21 | 1976 | 1481439.0 | 791533.0 | 263746.0 | 185880.0 | 199459.0 | 206589.0 |
23 | 1977 | 1511387.0 | 806350.0 | 268008.0 | 183381.0 | 206612.0 | 208364.0 |
25 | 1978 | 1553317.0 | 831830.0 | 285396.0 | 180665.0 | 209496.0 | 210385.0 |
27 | 1979 | 1620148.0 | 851321.0 | 300440.0 | 188518.0 | 232309.0 | 224407.0 |
29 | 1980 | 1615002.0 | 846984.0 | 310237.0 | 160224.0 | 245273.0 | 211920.0 |
31 | 1981 | 1594442.0 | 847062.0 | 318344.0 | 133904.0 | 266517.0 | 213097.0 |
33 | 1982 | 1627760.0 | 864265.0 | 328757.0 | 149436.0 | 274952.0 | 219781.0 |
35 | 1983 | 1654323.0 | 874955.0 | 328561.0 | 154643.0 | 287589.0 | 224112.0 |
37 | 1984 | 1740838.0 | 890327.0 | 323091.0 | 183220.0 | 297074.0 | 236128.0 |
39 | 1985 | 1825992.0 | 923762.0 | 330263.0 | 203955.0 | 315007.0 | 259576.0 |
41 | 1986 | 1938199.0 | 970914.0 | 332830.0 | 236282.0 | 319226.0 | 281585.0 |
43 | 1987 | 1960401.0 | 959976.0 | 334893.0 | 227970.0 | 334732.0 | 278267.0 |
45 | 1988 | 1983427.0 | 947774.0 | 334632.0 | 220781.0 | 365333.0 | 290022.0 |
47 | 1989 | 1994879.0 | 947038.0 | 331650.0 | 228800.0 | 382350.0 | 305677.0 |
49 | 1990 | 2032441.0 | 947408.0 | 326540.0 | 227572.0 | 407305.0 | 312939.0 |
51 | 1991 | 2049177.0 | 962895.0 | 331961.0 | 218680.0 | 432431.0 | 325555.0 |
53 | 1992 | 2080460.0 | 981584.0 | 334136.0 | 222853.0 | 433602.0 | 325129.0 |
55 | 1993 | 2108104.0 | 989510.0 | 348366.0 | 205585.0 | 438932.0 | 320528.0 |
57 | 1994 | 2210638.0 | 1040029.0 | 355506.0 | 233340.0 | 475081.0 | 363124.0 |
59 | 1995 | 2285325.0 | 1059095.0 | 363169.0 | 267590.0 | 488777.0 | 388553.0 |
61 | 1996 | 2352072.0 | 1087124.0 | 374242.0 | 270410.0 | 511506.0 | 400680.0 |
63 | 1997 | 2425112.0 | 1110554.0 | 376151.0 | 309371.0 | 534460.0 | 437574.0 |
65 | 1998 | 2495172.0 | 1139300.0 | 387763.0 | 327558.0 | 556380.0 | 470721.0 |
67 | 1999 | 2598080.0 | 1150485.0 | 400446.0 | 311579.0 | 619082.0 | 482738.0 |
69 | 2000 | 2727229.0 | 1164693.0 | 412188.0 | 348171.0 | 696948.0 | 548713.0 |
71 | 2001 | 2792144.0 | 1173920.0 | 420034.0 | 343040.0 | 720315.0 | 561855.0 |
73 | 2002 | 2820514.0 | 1193647.0 | 428984.0 | 338918.0 | 751704.0 | 597680.0 |
75 | 2003 | 2826381.0 | 1205061.0 | 430025.0 | 336613.0 | 742669.0 | 591557.0 |
77 | 2004 | 2893370.0 | 1248028.0 | 436581.0 | 359370.0 | 765061.0 | 633775.0 |
79 | 2005 | 3042633.0 | 1283510.0 | 441847.0 | 379250.0 | 824193.0 | 705349.0 |
81 | 2006 | 3212095.0 | 1319567.0 | 453065.0 | 427665.0 | 909395.0 | 803852.0 |
83 | 2007 | 3304041.0 | 1340375.0 | 458504.0 | 439093.0 | 942620.0 | 850781.0 |
85 | 2008 | 3333338.0 | 1359497.0 | 473421.0 | 418030.0 | 979118.0 | 891365.0 |
87 | 2009 | 3114274.0 | 1343243.0 | 487752.0 | 329277.0 | 888801.0 | 784904.0 |
89 | 2010 | 3120573.0 | 1357797.0 | 495575.0 | 327344.0 | 914933.0 | 789148.0 |
91 | 2011 | 3198587.0 | 1357064.0 | 492464.0 | 345198.0 | 980761.0 | 847889.0 |
93 | 2012 | 3225817.0 | 1365205.0 | 496195.0 | 353120.0 | 992159.0 | 870858.0 |
95 | 2013 | 3219252.0 | 1367181.0 | 495701.0 | 365093.0 | 1008147.0 | 883665.0 |
97 | 2014 | 3276865.0 | 1384585.0 | 505299.0 | 380755.0 | 1039691.0 | 918099.0 |
99 | 2015 | 3368200.0 | 1413235.0 | 513801.0 | 400705.0 | 1076884.0 | 959972.0 |
101 | 2016 | 3454769.0 | 1433261.0 | 514584.0 | 425666.0 | 1118549.0 | 1000599.0 |
103 | 2017 | 3533677.0 | 1456463.0 | 518315.0 | 442594.0 | 1158517.0 | 1036219.0 |
105 | 2018 | 3570475.0 | 1481042.0 | 521123.0 | 465359.0 | 1164099.0 | 1065876.0 |
plot_timeseries(nah1)
interactive(children=(Dropdown(description='variable', options=('Y', 'C', 'G', 'I', 'X', 'M'), value='Y'), Int…
Extend the interactive plot with a choice of real vs nominal.
Load data from an Excel or CSV file you have downloaded from e.g. Statistikbanken.dk. Clean, structure and present the data as you see fit.