import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
from collections import namedtuple
Dynamic programming solves a problem by breaking it up into smaller sub-problems, looks up the answer or solves and saves results and then combines the answers together to solve the original problem.
Key question:
A simple example:
def fib_recursion(n, table=[]):
"""returns fib of n"""
if n <= 1:
return n
if len(table) < 1:
table = np.zeros(n+1)
if table[n-1] == 0:
table[n-1] = fib_recursion(n-1, table)
if table[n-2] == 0:
table[n-2] = fib_recursion(n-2, table)
table[n] = table[n-1] + table[n-2]
return table[n]
[fib_recursion(n) for n in range(10)]
[0, 1, 1.0, 2.0, 3.0, 5.0, 8.0, 13.0, 21.0, 34.0]
Now what if we use a list:'
def fib_list(n):
table=[0,1]
for i in range(2, n+1):
table.append(table[i-1]+table[i-2])
return table[n]
[fib_list(n) for n in range(10)]
[0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
Now there isn't any point in saving all the fib numbers, so modifying the code to save only the last two fib numbers:
def fib_list_two(n):
table=[0,1]
for i in range(2, n+1):
table.append(sum(table[-2:]))
del table[0]
return table[1]
[fib_list_two(n) for n in range(10)]
[1, 1, 1, 2, 3, 5, 8, 13, 21, 34]
%time fib_recursion(100)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns Wall time: 455 µs
3.54224848179262e+20
The recursion fails as exceeds max memory depth on pretty small fib numbers
%time fib_list(1000)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns Wall time: 525 µs
43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875
Now this holds all the fib numbers up to n in memory in a list. Which for large n seems un necessary to waste all that memory.
%time fib_list_two(1000)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns Wall time: 1.17 ms
43466557686937456435688527675040625802564660517371780402481729089536555417949051890403879840079255169295922593080322634775209689623239873322471161642996440906533187938298969649928516003704476137795166849228875
income_url="http://docs.google.com/spreadsheet/pub?key=0AkBd6lyS3EmpdHo5S0J6ekhVOF9QaVhod05QSGV4T3c&output=xlsx"
life_url="http://docs.google.com/spreadsheet/pub?key=phAwcNAVuyj2tPLxKvvnNPA&output=xlsx"
pop_url="http://docs.google.com/spreadsheet/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&output=xlsx"
income = pd.read_excel(income_url)
life = pd.read_excel(life_url)
pop = pd.read_excel(pop_url)
income.head()
Income per person (fixed 2000 US$) | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | 1966 | 1967 | 1968 | ... | 2002 | 2003 | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Abkhazia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | Afghanistan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Akrotiri and Dhekelia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Albania | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1313.722725 | 1381.040832 | 1454.022854 | 1525.723589 | 1594.495067 | 1681.613910 | 1804.419415 | 1857.352947 | 1915.424459 | 1965.707230 |
4 | Algeria | 1280.384828 | 1085.414612 | 855.947986 | 1128.41578 | 1170.323896 | 1215.015783 | 1127.614288 | 1200.558225 | 1291.863983 | ... | 1871.921986 | 1971.512803 | 2043.135713 | 2115.186028 | 2124.957754 | 2155.485231 | 2173.787903 | 2192.703976 | 2231.980246 | 2255.225482 |
5 rows × 53 columns
life.head()
Life expectancy | 1800 | 1801 | 1802 | 1803 | 1804 | 1805 | 1806 | 1807 | 1808 | ... | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Abkhazia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | Afghanistan | 28.21 | 28.20 | 28.19 | 28.18 | 28.17 | 28.16 | 28.15 | 28.14 | 28.13 | ... | 52.4 | 52.8 | 53.3 | 53.6 | 54.0 | 54.4 | 54.8 | 54.9 | 53.8 | 52.72 |
2 | Akrotiri and Dhekelia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Albania | 35.40 | 35.40 | 35.40 | 35.40 | 35.40 | 35.40 | 35.40 | 35.40 | 35.40 | ... | 76.6 | 76.8 | 77.0 | 77.2 | 77.4 | 77.5 | 77.7 | 77.9 | 78.0 | 78.10 |
4 | Algeria | 28.82 | 28.82 | 28.82 | 28.82 | 28.82 | 28.82 | 28.82 | 28.82 | 28.82 | ... | 75.3 | 75.5 | 75.7 | 76.0 | 76.1 | 76.2 | 76.3 | 76.3 | 76.4 | 76.50 |
5 rows × 218 columns
pop.head()
Total population | 1800 | 1810 | 1820 | 1830 | 1840 | 1850 | 1860 | 1870 | 1880 | ... | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Abkhazia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | Afghanistan | 3280000.0 | 3280000.0 | 3323519.0 | 3448982.0 | 3625022.0 | 3810047.0 | 3973968.0 | 4169690.0 | 4419695.0 | ... | 25183615.0 | 25877544.0 | 26528741.0 | 27207291.0 | 27962207.0 | 28809167.0 | 29726803.0 | 30682500.0 | 31627506.0 | 32526562.0 |
2 | Akrotiri and Dhekelia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 15700.0 | 15700.0 | 15700.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Albania | 410445.0 | 423591.0 | 438671.0 | 457234.0 | 478227.0 | 506889.0 | 552800.0 | 610036.0 | 672544.0 | ... | 3050741.0 | 3010849.0 | 2968026.0 | 2929886.0 | 2901883.0 | 2886010.0 | 2880667.0 | 2883281.0 | 2889676.0 | 2896679.0 |
4 | Algeria | 2503218.0 | 2595056.0 | 2713079.0 | 2880355.0 | 3082721.0 | 3299305.0 | 3536468.0 | 3811028.0 | 4143163.0 | ... | 33749328.0 | 34261971.0 | 34811059.0 | 35401790.0 | 36036159.0 | 36717132.0 | 37439427.0 | 38186135.0 | 38934334.0 | 39666519.0 |
5 rows × 82 columns
The most recent year where we have info on all three is 2011, so lets look at 2011 data:
df = pop[["Total population", 2011]].copy()
df.rename(columns={2011: "Population"}, inplace=True)
df.head()
Total population | Population | |
---|---|---|
0 | Abkhazia | NaN |
1 | Afghanistan | 28809167.0 |
2 | Akrotiri and Dhekelia | NaN |
3 | Albania | 2886010.0 |
4 | Algeria | 36717132.0 |
df = df.join(life[2011])
df.rename(columns={2011: "Life"}, inplace=True)
df.head()
Total population | Population | Life | |
---|---|---|---|
0 | Abkhazia | NaN | NaN |
1 | Afghanistan | 28809167.0 | 54.0 |
2 | Akrotiri and Dhekelia | NaN | NaN |
3 | Albania | 2886010.0 | 77.4 |
4 | Algeria | 36717132.0 | 76.1 |
df = df.join(income["2011"])
df.rename(columns={"2011": "Income"}, inplace=True)
df.head()
Total population | Population | Life | Income | |
---|---|---|---|---|
0 | Abkhazia | NaN | NaN | NaN |
1 | Afghanistan | 28809167.0 | 54.0 | NaN |
2 | Akrotiri and Dhekelia | NaN | NaN | NaN |
3 | Albania | 2886010.0 | 77.4 | 1965.707230 |
4 | Algeria | 36717132.0 | 76.1 | 2255.225482 |
So now we have a dataframe containing country names, their population, adjusted per capita income and life expectance.
To simplify things, I'm going to drop the missing values:
df.dropna(inplace=True)
df.head()
Total population | Population | Life | Income | |
---|---|---|---|---|
3 | Albania | 2886010.0 | 77.4 | 1965.707230 |
4 | Algeria | 36717132.0 | 76.1 | 2255.225482 |
7 | Angola | 21942296.0 | 58.1 | 629.955306 |
9 | Antigua and Barbuda | 88152.0 | 75.9 | 9977.957073 |
10 | Argentina | 41655616.0 | 76.0 | 11601.630223 |
Lets say we want 500 million people with the highest income. So the population becomes the "weights", and the income the "values" of this knapsack problem.
So lets divide this problem into sizes of 10M, so we go 10, 20, 30... all the way to 500M
first, starting with 10 to 50M to keep it simple, heres our table:
solved = pd.DataFrame(columns=np.arange(10,51,10), index=df["Total population"])
solved.fillna(value=0, inplace=True)
solved.head()
10 | 20 | 30 | 40 | 50 | |
---|---|---|---|---|---|
Total population | |||||
Albania | 0 | 0 | 0 | 0 | 0 |
Algeria | 0 | 0 | 0 | 0 | 0 |
Angola | 0 | 0 | 0 | 0 | 0 |
Antigua and Barbuda | 0 | 0 | 0 | 0 | 0 |
Argentina | 0 | 0 | 0 | 0 | 0 |
ans = {}
ans[10] = 100, 200, ["countryA", "B"]
ans[10]
(100, 200, ['countryA', 'B'])
def pop_income(pop_size):
"""takes in a pop_size, and returns a tuple containing
max income and pop and a list of countries chosen"""
if pop_size in solved.keys():
return solved[max]
pop = 0
income = 0
for i, row in df[df["Population"]<= pop_size].iterrows():
print(row)
pop_income(10*np.power(10,4))
np.power(10,9)
Total population Antigua and Barbuda Population 88152 Life 75.9 Income 9977.96 Name: 9, dtype: object Total population Dominica Population 71402 Life 73.4 Income 6518.66 Name: 61, dtype: object Total population Marshall Islands Population 52541 Life 66 Income 2522.82 Name: 139, dtype: object Total population Seychelles Population 93810 Life 73.4 Income 9279.11 Name: 202, dtype: object
1000000000