#!/usr/bin/env python
# coding: utf-8

# ## Implementation of Tim Salimans' Don't Overfit submission

# From [Kaggle](http://www.kaggle.com/c/overfitting)
#
# > In order to achieve this we have created a simulated data set with 200 variables and 20,000 cases. An ‘equation’ based on this data was created in order to generate a Target to be predicted. Given all 20,000 cases, the problem is very easy to solve – but you are only given the Target value of 250 cases – the task is to build a model that gives the best predictions on the remaining 19,750 cases.

# In[30]:

import requests
import zipfile

url = "https://dl.dropbox.com/s/lnly9gw8pb1xhir/overfitting.zip"
results = requests.get(url)


# In[31]:

import StringIO

# wrap the downloaded bytes so zipfile can treat them as a file
z = zipfile.ZipFile(StringIO.StringIO(results.content))


# In[32]:

z.extractall()


# In[38]:

z.namelist()


# In[69]:

# peek at the CSV header to see the column layout
d = z.open('overfitting.csv')
d.readline()


# In[70]:

import numpy as np


# In[4]:

data = np.loadtxt("overfitting.csv", delimiter=",", skiprows=1)


# In[5]:

print """
There are also 5 other fields:

case_id            - 1 to 20,000, a unique identifier for each row.
train              - 1/0, a flag for the first 250 rows, which are the training dataset.
Target_Practice    - all 20,000 Targets are provided for this model, so you can develop your method completely offline.
Target_Leaderboard - only 250 Targets are provided. You submit your predictions for the remaining 19,750 to the Kaggle leaderboard.
Target_Evaluate    - again, only 250 Targets are provided. Competitors who beat the 'benchmark' on the Leaderboard will be asked to make one further submission for the Evaluation model.
"""
data.shape


# In[6]:

# column 0: case_id, column 1: train flag, column 2: Target_Practice,
# columns 5 onward: the 200 X variables
ix_training = data[:, 1] == 1
ix_testing = data[:, 1] == 0

training_data = data[ix_training, 5:]
testing_data = data[ix_testing, 5:]

training_labels = data[ix_training, 2]
testing_labels = data[ix_testing, 2]

print "training:", training_data.shape, training_labels.shape
print "testing: ", testing_data.shape, testing_labels.shape


# ## Develop Tim's model
#
# He mentions that the X variables are drawn from a Uniform distribution. Let's investigate this:

# In[7]:

# %pylab inline provides figsize, hist, imshow, colorbar, etc.
get_ipython().run_line_magic('pylab', 'inline')
figsize(12, 4)


# In[8]:

hist(training_data.flatten())
print training_data.shape[0] * training_data.shape[1]
# looks pretty right: flat, as a Uniform(0, 1) sample should be
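# The histogram is only a coarse check. As a minimal sketch (not part of the
# original notebook, and assuming scipy is installed), a Kolmogorov-Smirnov test
# of each column against Uniform(0, 1) gives a more direct per-variable check:

# In[ ]:

from scipy import stats

# KS test of each of the 200 columns against the Uniform(0, 1) CDF; if the data
# really are uniform, very few columns should be rejected at the 5% level.
p_values = np.array([stats.kstest(training_data[:, i], "uniform")[1]
                     for i in range(training_data.shape[1])])
print "columns rejected at 5%:", (p_values < 0.05).sum(), "of", len(p_values)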
# In[127]:

import pymc as pm

# spike-and-slab style prior: each of the 200 variables is included
# in the model with prior probability 1/2
to_include = pm.Bernoulli("to_include", 0.5, size=200)


# In[128]:

# prior on the regression coefficients
coef = pm.Uniform("coefs", 0, 1, size=200)


# In[129]:

@pm.deterministic
def Z(coef=coef, to_include=to_include, data=training_data):
    # linear score over the included variables, centered at zero
    ym = np.dot(to_include * data, coef)
    return ym - ym.mean()


# In[130]:

@pm.deterministic
def T(z=Z):
    # map the sign of z to a success probability: 0.045 if z < 0, 0.945 if z > 0
    return 0.45 * (np.sign(z) + 1.1)


# In[132]:

obs = pm.Bernoulli("obs", T, value=training_labels, observed=True)

model = pm.Model([to_include, coef, Z, T, obs])
map_ = pm.MAP(model)
map_.fit()


# In[133]:

mcmc = pm.MCMC(model)


# In[176]:

# 100,000 steps, discarding the first 90,000 as burn-in, no thinning
mcmc.sample(100000, 90000, 1)


# In[177]:

# in-sample accuracy of the chain's current state
(np.round(T.value) == training_labels).mean()


# In[178]:

t_trace = mcmc.trace("T")[:]
(np.round(t_trace[-500:-400, :]).mean(axis=0) == training_labels).mean()


# In[179]:

# fraction of the 250 training cases classified as 1 in each posterior sample
t_mean = np.round(t_trace).mean(axis=1)


# In[180]:

imshow(t_trace[-10000:, :], aspect="auto")
colorbar()


# In[181]:

figsize(23, 8)
coef_trace = mcmc.trace("coefs")[:]
imshow(coef_trace[-10000:, :], aspect="auto", cmap=pyplot.cm.RdBu,
       interpolation="none")


# In[183]:

include_trace = mcmc.trace("to_include")[:]


# In[184]:

figsize(23, 8)
imshow(include_trace[-10000:, :], aspect="auto", interpolation="none")
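# The notebook stops at the trace plots. As a sketch (not part of the original),
# the retained posterior samples can be turned into predictions for the 19,750
# held-out cases by pushing each sampled (to_include, coefs) pair through the same
# model and averaging; centering the test scores by their own mean, mirroring Z
# above, is an assumption here.

# In[ ]:

n_samples = coef_trace.shape[0]
test_probs = np.zeros(testing_data.shape[0])
for i in range(n_samples):
    # score the testing data with the i-th posterior draw
    zm = np.dot(include_trace[i] * testing_data, coef_trace[i])
    zm = zm - zm.mean()  # assumed: center on the test set, as Z does on training
    test_probs += 0.45 * (np.sign(zm) + 1.1)
test_probs /= n_samples

print "held-out accuracy:", (np.round(test_probs) == testing_labels).mean()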