import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
url = "https://en.wikipedia.org/wiki/List_of_German_states_by_GDP"
Content of the Wikipedia page:
NOTE! List for 2014 is missing (Missing data is a very common case in practice)
# requirements: https://stackoverflow.com/questions/34555135/pandas-read-html
# pd.read_html(url)[year_table_map[2015]]
year_table_map = {
2015:3,
2013:4,
2012:5,
2011:6,
2010:7,
2009:8,
2008:9
}
def calculate_population(gdp,gdp_pc):
return int((gdp*1000000000)/gdp_pc)
def get_population():
global year_table_map
df = pd.read_html(url)
# print(df)
df = pd.read_html(url)[year_table_map[2015]]
# print(df)
df.columns = ["STATE", "RANK", "GDP_EUR", "GDP_USD", "GDP_PC"]
df = df.drop(["RANK","GDP_USD"],axis=1)
df = df[~df["STATE"].isin(["States","Germany","Former GDR states and all of Berlin"])]
df[["GDP_EUR","GDP_PC"]] = df[["GDP_EUR", "GDP_PC"]].apply(pd.to_numeric)
df["POPULATION"] = df.apply(lambda x: calculate_population(x["GDP_EUR"], x["GDP_PC"]), axis=1)
df = df[["STATE", "POPULATION"]]
return df
POPULATION = get_population()
def population_for_state(state):
global POPULATION
return POPULATION[POPULATION["STATE"] == state]["POPULATION"].values[0]
def add_per_capita(x):
global POPULATION
gdp = x["GDP_EUR"]
state = x["STATE"]
population = population_for_state(state)
return (float(gdp)*1000000000)/population
def plot_population(population_df):
ax = sns.barplot(population_df["POPULATION"],population_df["STATE"],estimator=abs)
plt.title("States by Population")
ax.set(xlabel='POPULATION', ylabel='STATE')
# plt.xticks(population_df["POPULATION"].valu)
def get_table_for_year(url, year, no_per_capita=True):
""" Returns datafame representation of the table on wikipedia page
"""
global year_table_map, POPULATION
if year not in year_table_map.keys():
return "There is no table for the input year on the Wikipedia page"
df = pd.read_html(url)[year_table_map[year]]
if year == 2015:
df.columns = ["STATE", "RANK", "GDP_EUR", "GDP_USD", "GDP_PC"]
df = df[~df["STATE"].isin(["States","Germany","Former GDR states and all of Berlin"])]
df[["RANK","GDP_EUR","GDP_USD","GDP_PC"]] = df[["RANK","GDP_EUR","GDP_USD","GDP_PC",]].apply(pd.to_numeric)
df = df.drop(["GDP_USD"], axis=1)
if no_per_capita:
df = df.drop(["GDP_PC", "GDP_USD"], axis=1)
else:
df.columns = ["STATE", "RANK", "GDP_EUR", "GDP_USD", "GDP_SHARE"]
df = df[~df["STATE"].isin(["States","Germany","Former GDR states and all of Berlin"])]
df[["RANK","GDP_EUR","GDP_USD"]] = df[["RANK","GDP_EUR","GDP_USD"]].apply(pd.to_numeric)
df = df.drop(["GDP_SHARE", "GDP_USD"], axis=1)
if not no_per_capita:
df["GDP_PC"] = df.apply(lambda x: add_per_capita(x), axis=1)
return df
def plot_gdp_per_capita(year):
df = get_table_for_year(url, 2015, no_per_capita=False)
ax = sns.barplot(df["GDP_PC"],df["STATE"],estimator=abs)
plt.title("States by GDP per capita")
ax.set(xlabel='GDP PER CAPITA', ylabel='STATE')
def merge_tables(years):
df_lst = []
for year in years:
df = get_table_for_year(url, year, no_per_capita=False)
df["YEAR"] = [year]*len(df)
# df = df
df_lst.append(df)
return pd.concat(df_lst).reset_index(drop=True)
def plot_gdp_over_time(all_tables, states=["Rhineland-Palatinate"]):
df = all_tables[all_tables["STATE"].isin(states)]
df['UNIT'] = ['EUR']*len(df)
sns.tsplot(df, time="YEAR",condition="STATE", unit="UNIT", value="GDP_EUR")
def get_stats(variable):
if variable == "GDP_EUR":
unit = "billion"
elif variable == "GDP_PC":
unit = "thousands"
else:
unit = ""
print("Average {}: {} ({})".format(variable,df[variable].mean(), unit))
print("Median {}: {} ({})".format(variable,df[variable].median(),unit))
print("Minimal {}: {} ({})".format(variable,df[variable].min(),unit))
print("Maximal {}: {} ({})".format(variable,df[variable].max(),unit))
As GDP per capita is given for year 2015 we can aproximate the population of each Bundesland and asume that the populations did not change dramaticaly in the meantime.
The population will be used to aproximate GDP PER CAPITA where it is missing
POPULATION = get_population()
POPULATION
STATE | POPULATION | |
---|---|---|
2 | North Rhine-Westphalia | 17683037 |
3 | Bavaria | 12744592 |
4 | Baden-Württemberg | 10777564 |
6 | Hesse | 6116221 |
7 | Lower Saxony | 7860443 |
8 | Rhineland-Palatinate | 4021179 |
9 | Berlin | 3485025 |
10 | Saxony | 4055947 |
11 | Hamburg | 1770172 |
12 | Schleswig-Holstein | 2840976 |
13 | Brandenburg | 2464575 |
14 | Thuringia | 2154870 |
15 | Saxony-Anhalt | 2231010 |
16 | Mecklenburg-Vorpommern | 1600586 |
17 | Saarland | 989240 |
18 | Bremen | 663613 |
population_for_state("Bavaria")
12744592
plot_population(POPULATION)
Now, lets download data from the web page. Python has a nice feature that make it easy to download tables from websites automaticaly. For this task, we need just one line of code. However, data usually needs to be procesed and cleaned. The columns we need are:
get_table_for_year(url, 2008,no_per_capita=True)
STATE | RANK | GDP_EUR | |
---|---|---|---|
2 | North Rhine-Westphalia | 1 | 547.54 |
3 | Bavaria | 2 | 444.81 |
4 | Baden-Württemberg | 3 | 365.06 |
5 | Hesse | 4 | 221.35 |
6 | Lower Saxony | 5 | 213.09 |
7 | Rhineland-Palatinate | 6 | 106.37 |
8 | Saxony | 7 | 94.92 |
9 | Berlin | 8 | 88.58 |
10 | Hamburg | 9 | 87.48 |
11 | Schleswig-Holstein | 10 | 73.94 |
12 | Brandenburg | 11 | 54.37 |
13 | Saxony-Anhalt | 12 | 53.72 |
14 | Thuringia | 13 | 50.38 |
15 | Mecklenburg-Vorpommern | 14 | 35.70 |
16 | Saarland | 15 | 31.05 |
17 | Bremen | 16 | 27.43 |
Looking at the page, we can see that GDP per capita is specified only for years 2015 and 2008. This information is rather useful, it is GDP divided by the population of a state. For example, India ranks 7th in the world by total GDP which means it is a very large economy, but when the GDP is dividide by the population of India takes the 122nd place in the world, which means it's citizens are not among the richer in todays standings.
But, if you remember, we have calculated the populations of all Federal states and this means we can aproximate GDP per capita for them! Just add a parameter no_per_capita to False.
get_table_for_year(url, 2015, no_per_capita=False)
STATE | RANK | GDP_EUR | GDP_PC | |
---|---|---|---|---|
2 | North Rhine-Westphalia | 1 | 645.590 | 36509 |
3 | Bavaria | 2 | 549.190 | 43092 |
4 | Baden-Württemberg | 3 | 460.687 | 42745 |
6 | Hesse | 4 | 263.444 | 43073 |
7 | Lower Saxony | 5 | 258.530 | 32890 |
8 | Rhineland-Palatinate | 6 | 131.951 | 32814 |
9 | Berlin | 7 | 124.161 | 35627 |
10 | Saxony | 8 | 112.658 | 27776 |
11 | Hamburg | 9 | 109.271 | 61729 |
12 | Schleswig-Holstein | 10 | 85.610 | 30134 |
13 | Brandenburg | 11 | 65.294 | 26493 |
14 | Thuringia | 12 | 56.811 | 26364 |
15 | Saxony-Anhalt | 13 | 56.217 | 25198 |
16 | Mecklenburg-Vorpommern | 14 | 39.869 | 24909 |
17 | Saarland | 15 | 35.028 | 35409 |
18 | Bremen | 16 | 31.590 | 47603 |
df = get_table_for_year(url, 2015, no_per_capita=False)
get_stats("GDP_EUR")
Average GDP_EUR: 189.11881250000002 (billion) Median GDP_EUR: 110.96449999999999 (billion) Minimal GDP_EUR: 31.59 (billion) Maximal GDP_EUR: 645.59 (billion)
get_stats("GDP_PC")
Average GDP_PC: 35772.8125 (thousands) Median GDP_PC: 34149.5 (thousands) Minimal GDP_PC: 24909 (thousands) Maximal GDP_PC: 61729 (thousands)
plot_gdp_per_capita(2015)
df = merge_tables([2008,2009,2010,2011,2012,2013,2015])
df.head(20) #first 20 rows
STATE | RANK | GDP_EUR | GDP_PC | YEAR | |
---|---|---|---|---|---|
0 | North Rhine-Westphalia | 1 | 547.54 | 30964.138117 | 2008 |
1 | Bavaria | 2 | 444.81 | 34901.862688 | 2008 |
2 | Baden-Württemberg | 3 | 365.06 | 33872.218249 | 2008 |
3 | Hesse | 4 | 221.35 | 36190.647787 | 2008 |
4 | Lower Saxony | 5 | 213.09 | 27109.159115 | 2008 |
5 | Rhineland-Palatinate | 6 | 106.37 | 26452.440938 | 2008 |
6 | Saxony | 7 | 94.92 | 23402.672668 | 2008 |
7 | Berlin | 8 | 88.58 | 25417.321253 | 2008 |
8 | Hamburg | 9 | 87.48 | 49418.926522 | 2008 |
9 | Schleswig-Holstein | 10 | 73.94 | 26026.267029 | 2008 |
10 | Brandenburg | 11 | 54.37 | 22060.598683 | 2008 |
11 | Saxony-Anhalt | 12 | 53.72 | 24078.780463 | 2008 |
12 | Thuringia | 13 | 50.38 | 23379.600626 | 2008 |
13 | Mecklenburg-Vorpommern | 14 | 35.70 | 22304.331039 | 2008 |
14 | Saarland | 15 | 31.05 | 31387.731996 | 2008 |
15 | Bremen | 16 | 27.43 | 41334.331907 | 2008 |
16 | North Rhine-Westphalia | 1 | 522.92 | 29571.843343 | 2009 |
17 | Bavaria | 2 | 423.84 | 33256.458896 | 2009 |
18 | Baden-Württemberg | 3 | 341.23 | 31661.143464 | 2009 |
19 | Hesse | 4 | 215.29 | 35199.839901 | 2009 |
plot_gdp_over_time(df)
states = ["Rhineland-Palatinate","Bavaria","Hesse","North Rhine-Westphalia"]
plot_gdp_over_time(df, states=states)