%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Flour beetle counts observed over time
days = 0, 8, 28, 41, 63, 79, 97, 117, 135, 154
beetles = 2, 47, 192, 256, 768, 896, 1120, 896, 1184, 1024
plt.plot(days, beetles)

# Logistic growth model: N(t) = K*N0 / (N0 + (K - N0)*exp(-r*t))
K0, r0 = 1000, 0.1
logistic = lambda t, K, r, N0=1.: K*N0 / (N0 + (K - N0) * np.exp(-r*t))

times = np.arange(160)
plt.plot(times, logistic(times, K0, r0))
plt.plot(days, beetles, 'r.')

# Mean squared error of the model for a given (K, r)
def mse(params):
    K, r = params
    return np.mean([(logistic(t, K, r) - y)**2 for t, y in zip(days, beetles)])

mse([1000, 0.1])

from scipy.optimize import fmin_l_bfgs_b

opt = fmin_l_bfgs_b(mse, (1000, 0.1), approx_grad=True)
opt

K_opt, r_opt = opt[0]
plt.plot(times, logistic(times, K_opt, r_opt))
plt.plot(days, beetles, 'r.')

data = np.c_[days, beetles]
data

from scipy.stats.distributions import norm

# Negative log-likelihood assuming Gaussian errors around the logistic curve
def likelihood(params, data=data):
    K, r, s = params
    return -np.sum([norm.logpdf(y, logistic(t, K, r), s) for t, y in data])

from scipy.optimize import minimize

opt_mle = minimize(likelihood, (1000, 0.1, 1), method='Nelder-Mead')

# Refit the model to a resampled dataset
min_func = lambda s: minimize(likelihood, (1000, 0.1, 1), args=(s,), method='Nelder-Mead')

def bootstrap(data, nsamples, f):
    # Resample rows with replacement, refitting the model to each sample
    boot_samples = data[np.random.randint(len(data), size=(nsamples, len(data)))]
    return [f(s)['x'] for s in boot_samples]

estimates = bootstrap(data, 100, min_func)
np.std(estimates, 0)
np.corrcoef(np.array(estimates).T[:-1])

K_opt, r_opt = opt_mle['x'][:-1]
plt.plot(times, logistic(times, K_opt, r_opt))
plt.plot(days, beetles, 'r.')

import pandas as pd
import statsmodels.api as sm

baseball = pd.read_table('../data/textbook/baseball.dat', sep=r'\s+')
logsalary = baseball.salary.apply(np.log)
predictors = baseball.loc[:, 'average':]
nrows, ncols = predictors.shape

# OLS fit of log-salary on a set of predictors (statsmodels replaces the
# long-removed pd.ols)
fit_ols = lambda x: sm.OLS(logsalary, sm.add_constant(x)).fit()

# AIC = n * ln(SSE/n) + 2k
aic = lambda g: g.nobs * np.log(g.ssr/g.nobs) + 2*len(g.params)

periods = 15
period_lengths = [60]*(periods//3) + [120]*(periods//3) + [220]*(periods//3)
tau_start = 10
cooling_schedule = [tau_start * 0.9**i for i in range(periods)]

def anneal(periods, period_lengths, cooling_schedule, nbrhd=1):
    aic_values = []
    tau = cooling_schedule
    # Random initial subset of predictors
    solution_current = solution_best = np.random.binomial(1, 0.5, ncols).astype(bool)
    solution_vars = predictors[predictors.columns[solution_current]]
    g = fit_ols(solution_vars)
    aic_best = aic(g)
    aic_values.append(aic_best)

    for j in range(periods):
        for i in range(period_lengths[j]):
            # Propose a neighbor by flipping nbrhd randomly chosen predictors
            solution_step = solution_current.copy()
            flip = np.random.randint(0, ncols, nbrhd)
            solution_step[flip] = ~solution_current[flip]
            solution_vars = predictors[predictors.columns[solution_step]]
            g = fit_ols(solution_vars)
            aic_step = aic(g)
            alpha = min(1, np.exp((aic_values[-1] - aic_step)/tau[j]))
            if (aic_step < aic_values[-1]) or (np.random.uniform() < alpha):
                # Accept proposed solution
                solution_current = solution_step.copy()
                aic_values.append(aic_step)
                if aic_step < aic_best:
                    # Replace previous best with this one
                    aic_best = aic_step
                    solution_best = solution_step.copy()
            else:
                # Retain current solution
                aic_values.append(aic_values[-1])

    return solution_best, aic_values

# Schedule 1: period lengths 60/120/220 across thirds of the schedule
solution1, aic1 = anneal(periods, period_lengths, cooling_schedule)

# Schedule 2: many one-step periods with slower cooling
periods2 = 2000
period_lengths2 = [1]*periods2
cooling_schedule2 = [tau_start * 0.999**i for i in range(periods2)]
solution2, aic2 = anneal(periods2, period_lengths2, cooling_schedule2)

def plot_aic(aic_values, solution_best):
    aic_values = np.array(aic_values)
    plt.plot(aic_values)
    plt.xlim(0, len(aic_values))
    plt.xlabel('Iteration'); plt.ylabel('AIC')
    aic_best = np.min(aic_values)
    best_iter = np.where(aic_values == aic_best)[0][0]
    print('Best AIC: {0}\nBest solution: {1}\nDiscovered at iteration {2}'.format(
        aic_best, np.where(solution_best)[0], best_iter))
    plt.plot(best_iter, aic_best, 'ro')
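# Aside (added illustration, not part of the original notebook): the Metropolis
# acceptance rule in `anneal` accepts an uphill move with probability
# min(1, exp(-delta/tau)), so uphill moves become increasingly rare as tau
# cools. `delta_aic` below is a hypothetical uphill step (an AIC increase of 2).
delta_aic = 2.
plt.plot([min(1, np.exp(-delta_aic/t)) for t in cooling_schedule])
plt.xlabel('Period'); plt.ylabel('P(accept +2 AIC step)')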
plot_aic(aic1, solution1)
plot_aic(aic2, solution2)

# Larger neighborhoods: flip 2 or 3 predictors per proposal
solution_n2, aic_n2 = anneal(periods, period_lengths, cooling_schedule, nbrhd=2)
solution_n3, aic_n3 = anneal(periods, period_lengths, cooling_schedule, nbrhd=3)

def calculate_fitness(aic_values):
    # Rank-based fitness: the lowest AIC gets the largest selection probability
    P = len(aic_values)
    aic_rank = (-aic_values).argsort().argsort() + 1.
    return 2.*aic_rank/(P*(P + 1.))

def run_ga(pop_size=20, iterations=100, mutation_rate=0.05):
    aic_best = []
    best_solution = []
    aic_history = []
    # Initialize genotype
    current_gen = np.random.binomial(1, 0.5, pop_size*ncols).reshape((pop_size, ncols))
    for i in range(iterations):
        # Get phenotype
        current_phe = [predictors[predictors.columns[g.astype(bool)]] for g in current_gen]
        # Calculate AIC
        current_aic = np.array([aic(fit_ols(x)) for x in current_phe])
        # Get lowest AIC
        aic_best.append(current_aic[np.argmin(current_aic)])
        best_solution.append(current_gen[np.argmin(current_aic)])
        # Calculate fitness according to AIC rank
        fitness = calculate_fitness(current_aic)
        # Choose first parents according to fitness
        moms = np.random.choice(range(pop_size), size=pop_size//2, p=fitness)
        # Choose second parents randomly
        dads = np.random.choice(range(pop_size), size=pop_size//2)
        next_gen = []
        for x, y in zip(current_gen[moms], current_gen[dads]):
            # Crossover at a random locus
            cross = np.random.randint(0, ncols)
            child1 = np.r_[x[:cross], y[cross:]]
            child2 = np.r_[y[:cross], x[cross:]]
            # Mutate: flip each bit independently with probability mutation_rate
            m1 = np.random.binomial(1, mutation_rate, size=ncols).astype(bool)
            child1[m1] = abs(child1[m1] - 1)
            m2 = np.random.binomial(1, mutation_rate, size=ncols).astype(bool)
            child2[m2] = abs(child2[m2] - 1)
            next_gen += [child1, child2]
        # Increment generation
        current_gen = np.array(next_gen)
        # Store AIC values
        aic_history.append(current_aic)

    aic_history = np.array(aic_history)
    aic_best_min = np.min(aic_best)
    best_iter = np.where(aic_history == aic_best_min)[0][0]
    fig, axes = plt.subplots(nrows=1, ncols=2)
    print('Best AIC: {0}\nBest solution: {1}\nDiscovered at iteration {2}'.format(
        aic_best_min, np.where(best_solution[best_iter])[0], best_iter))
    axes[0].plot(aic_best, 'r-')
    axes[0].set_xlim(0, len(aic_best))
    axes[0].set_xlabel('Iteration')
    axes[0].set_ylabel('AIC')
    for i, x in enumerate(aic_history):
        axes[1].plot(np.ones(len(x))*i, -x, 'r.', alpha=0.3)

run_ga()
run_ga(mutation_rate=0.1)
run_ga(mutation_rate=0.001)
run_ga(pop_size=100, mutation_rate=0.05)
run_ga(pop_size=10, mutation_rate=0.05)
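# Aside (added illustration, not part of the original notebook): a quick check
# of `calculate_fitness` on fabricated AIC values. The probabilities are
# rank-based, sum to one, and give the lowest AIC the largest weight.
demo_aic = np.array([310., 305., 320., 300.])
demo_fitness = calculate_fitness(demo_aic)
demo_fitness, demo_fitness.sum()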
def run_ga2(pop_size=20, iterations=100, mutation_rate=0.2):
    # Identical to run_ga, except both parents are selected according to fitness
    aic_best = []
    best_solution = []
    aic_history = []
    # Initialize genotype
    current_gen = np.random.binomial(1, 0.5, pop_size*ncols).reshape((pop_size, ncols))
    for i in range(iterations):
        # Get phenotype
        current_phe = [predictors[predictors.columns[g.astype(bool)]] for g in current_gen]
        # Calculate AIC
        current_aic = np.array([aic(fit_ols(x)) for x in current_phe])
        # Get lowest AIC
        aic_best.append(current_aic[np.argmin(current_aic)])
        best_solution.append(current_gen[np.argmin(current_aic)])
        # Calculate fitness according to AIC rank
        fitness = calculate_fitness(current_aic)
        # Choose both parents according to fitness
        moms = np.random.choice(range(pop_size), size=pop_size//2, p=fitness)
        dads = np.random.choice(range(pop_size), size=pop_size//2, p=fitness)
        next_gen = []
        for x, y in zip(current_gen[moms], current_gen[dads]):
            # Crossover at a random locus
            cross = np.random.randint(0, ncols)
            child1 = np.r_[x[:cross], y[cross:]]
            child2 = np.r_[y[:cross], x[cross:]]
            # Mutate: flip each bit independently with probability mutation_rate
            m1 = np.random.binomial(1, mutation_rate, size=ncols).astype(bool)
            child1[m1] = abs(child1[m1] - 1)
            m2 = np.random.binomial(1, mutation_rate, size=ncols).astype(bool)
            child2[m2] = abs(child2[m2] - 1)
            next_gen += [child1, child2]
        # Increment generation
        current_gen = np.array(next_gen)
        # Store AIC values
        aic_history.append(current_aic)

    aic_history = np.array(aic_history)
    aic_best_min = np.min(aic_best)
    best_iter = np.where(aic_history == aic_best_min)[0][0]
    fig, axes = plt.subplots(nrows=1, ncols=2)
    print('Best AIC: {0}\nBest solution: {1}\nDiscovered at iteration {2}'.format(
        aic_best_min, np.where(best_solution[best_iter])[0], best_iter))
    axes[0].plot(aic_best, 'r-')
    axes[0].set_xlim(0, len(aic_best))
    axes[0].set_xlabel('Iteration')
    axes[0].set_ylabel('AIC')
    for i, x in enumerate(aic_history):
        axes[1].plot(np.ones(len(x))*i, -x, 'r.', alpha=0.3)

run_ga2(pop_size=20, mutation_rate=0.05)

# Traveling salesman: tour of the Brazilian state capitals
def parse_latlon(x):
    # Convert 'deg:min:sec' strings to decimal degrees
    d, m, s = map(float, x.split(':'))
    ms = m/60. + s/3600.
    if d < 0:
        return d - ms
    return d + ms

cities = pd.read_csv('../data/brasil_capitals.txt',
                     names=['city', 'lat', 'lon'])[['lat', 'lon']].applymap(parse_latlon).values
ncities = len(cities)

periods = 60
cooling = [60]*(periods//3) + [120]*(periods//3) + [220]*(periods//3)
tau_start = 50
#switches = [4]*(periods//3) + [3]*(periods//3) + [2]*(periods//3)

from scipy.spatial.distance import euclidean

# Total distance along consecutive cities in a tour
distance = lambda s: int(np.sum([euclidean(c1, c2)
                                 for c1, c2 in zip(cities[s][:-1], cities[s][1:])]))

# Random initial tour, starting and ending at city 0
solution_current = solution_best = np.concatenate([[0], np.random.permutation(range(1, len(cities))), [0]])
distance_best = distance(solution_current)
distance_values = [distance_best]

# Cooling schedule
tau = [tau_start * 0.9**i for i in range(periods)]

for j in range(periods):
    for i in range(cooling[j]):
        # Propose a neighbor by swapping two cities (the endpoints stay fixed)
        solution_step = solution_current.copy()
        switch = np.random.randint(1, ncities, 2)
        solution_step[switch] = solution_step[switch[::-1]]
        d = distance(solution_step)
        alpha = min(1, np.exp((distance_values[-1] - d)/tau[j]))
        if (d < distance_values[-1]) or (np.random.uniform() < alpha):
            # Accept proposed solution
            solution_current = solution_step.copy()
            distance_values.append(d)
            if d < distance_best:
                # Replace previous best with this one
                distance_best = d
                solution_best = solution_step.copy()
        else:
            # Retain current solution
            distance_values.append(distance_values[-1])

plt.plot(distance_values)
plt.xlim(0, len(distance_values))
plt.xlabel('Iteration'); plt.ylabel('Distance')
print('Shortest Distance: {0}\nBest solution: {1}\nDiscovered at iteration {2}'.format(
    distance_best, solution_best, distance_values.index(distance_best)))
plt.plot(distance_values.index(distance_best), distance_best, 'ro')
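# Aside (added sanity check, not part of the original notebook): the stored
# best tour should reproduce the reported best distance and visit every city
# exactly once before returning to city 0.
assert distance(solution_best) == distance_best
assert np.array_equal(np.sort(solution_best[:-1]), np.arange(ncities))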
from mpl_toolkits.basemap import Basemap

xmin, xmax, ymin, ymax = -70.0, -20.0, -40.0, 10.0
fig = plt.figure(figsize=(11.7, 8.3))
bm = Basemap(projection='merc',
             llcrnrlon=xmin, llcrnrlat=ymin,
             urcrnrlon=xmax, urcrnrlat=ymax,
             lon_0=0.5*(xmax + xmin), lat_0=0.5*(ymax + ymin),
             resolution='l', area_thresh=1000000)

bm.drawcoastlines(linewidth=1.5)
bm.bluemarble()
bm.drawcountries(linewidth=2)
bm.drawstates(linewidth=1)

for i, c in enumerate(cities):
    # Mark the starting city in cyan, the rest in red
    pty = 'co' if not i else 'r.'
    bm.plot(-c[0], c[1], pty, latlon=True)
    _x, _y = bm(-cities[i][0], cities[i][1])
    plt.text(_x, _y, str(i), fontsize=9, color='yellow')

xx, yy = cities[solution_best].T
bm.plot(-xx, yy, 'y--', latlon=True)

# Log-posterior of the log-odds theta, given y successes in n trials and a
# normal prior with mean mu and variance sigma2
def logodds_post(theta, n=5, y=5, mu=0, sigma2=0.25):
    lpost = y*theta - n*np.log(1. + np.exp(theta))
    lpost -= (theta - mu)**2 / (2*sigma2)
    return lpost

logodds_post_min = lambda *args: -logodds_post(*args)

from scipy.optimize import fmin_bfgs

opt = fmin_bfgs(logodds_post_min, 1., full_output=True)
mode, var = opt[0], opt[3]
mode, var

from scipy.stats.distributions import norm

# Normal approximation to P(theta > 0)
1. - norm.cdf(0, loc=mode[0], scale=np.sqrt(var[0][0]))

# Log-ratio of the posterior to the N(0, 0.25) prior envelope
# (variance 0.25, so the scale is its square root, 0.5)
def calc_diff(theta):
    return logodds_post(theta) - norm.logpdf(theta, loc=0, scale=0.5)

calc_diff_min = lambda *args: -calc_diff(*args)

opt = fmin_bfgs(calc_diff_min, 1, full_output=True)
c = opt[1]
c

def reject(n, c, mu=0, s=0.25):
    # Draw samples from the envelope g(theta) = N(mu, s), s being a variance
    theta = norm.rvs(loc=mu, scale=np.sqrt(s), size=n)
    # Log-density under g(theta)
    gvals = norm.logpdf(theta, loc=mu, scale=np.sqrt(s))
    # Log-density under f(theta)
    fvals = np.array([logodds_post(t) for t in theta])
    # Acceptance probability f(theta)/(M*g(theta)), where log M = -c
    p = np.exp(fvals - gvals + c)
    return theta[np.random.random(n) < p]

nsamples = 1000000
samples = reject(nsamples, c)
np.mean(samples > 0)

# Sampling-importance resampling (SIR) from the normal approximation
n = 100000
mu, s = mode[0], var[0][0]
theta = norm.rvs(loc=mu, scale=np.sqrt(s), size=n)
f_theta = np.array([logodds_post(t) for t in theta])
q_theta = norm.logpdf(theta, loc=mu, scale=np.sqrt(s))
w = np.exp(f_theta - q_theta - max(f_theta - q_theta))
p_sir = w/w.sum()
theta_sir = theta[np.random.choice(range(len(theta)), size=10000, p=p_sir)]
plt.hist(theta_sir)
np.mean(theta_sir > 0)
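# Aside (added comparison, not part of the original notebook): the three
# approximations of P(theta > 0) computed above should roughly agree.
print('Normal approximation:', 1. - norm.cdf(0, loc=mode[0], scale=np.sqrt(var[0][0])))
print('Rejection sampling:  ', np.mean(samples > 0))
print('SIR:                 ', np.mean(theta_sir > 0))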