# Exploratory analysis of the Kaggle "overfitting" data set.
#
# The script downloads and unpacks the archive, splits the rows into a
# training and a testing subset, and fits a PyMC (version 2 API) model that
# combines per-feature inclusion flags with per-feature coefficients.
#
# NOTE(review): the plotting helpers used below (figsize, hist, imshow,
# colorbar, pyplot) are not imported anywhere in this file.  They are
# presumably supplied by an IPython ``%pylab`` session -- confirm before
# running this as a stand-alone script.

import gzip
import io
import zipfile

import numpy as np
import requests

import pymc as pm

try:
    # Python 2 only module; retained so any later code that still refers to
    # it keeps working there.  The download below uses io.BytesIO instead.
    import StringIO
except ImportError:
    StringIO = None


# ---------------------------------------------------------------------------
# Download and unpack the data
# ---------------------------------------------------------------------------

url = "https://dl.dropbox.com/s/lnly9gw8pb1xhir/overfitting.zip"
results = requests.get(url)
# Fail early with a clear HTTP error instead of an opaque "bad zip file".
results.raise_for_status()

# The response body is raw bytes, so wrap it in a bytes buffer (works on both
# Python 2 and Python 3, unlike StringIO.StringIO).
z = zipfile.ZipFile(io.BytesIO(results.content))
z.extractall()  # np.loadtxt below reads the extracted file from disk
archive_members = z.namelist()

# Early parsing attempt kept for backward compatibility of the module-level
# names `d` and `M`; the analysis itself uses `data` from np.loadtxt.
d = z.open('overfitting.csv')
d.readline()  # consume the first line before handing the rest to NumPy
M = np.fromstring(d.read(), sep=",")

data = np.loadtxt("overfitting.csv", delimiter=",", skiprows=1)

# Description of the non-feature columns.  The text is identical to the
# original literal; it is only split across source lines for readability.
DATASET_NOTES = (
    " There are also 5 other fields, "
    "case_id - 1 to 20,000, a unique identifier for each row "
    "train - 1/0, this is a flag for the first 250 rows which are "
    "the training dataset "
    "Target_Practice - we have provided all 20,000 Targets for this "
    "model, so you can develop your method completely off line. "
    "Target_Leaderboard - only 250 Targets are provided. You submit "
    "your predictions for the remaining 19,750 to the Kaggle "
    "leaderboard. "
    "Target_Evaluate - again only 250 Targets are provided. Those "
    "competitors who beat the 'benchmark' on the Leaderboard will be "
    "asked to make one further submission for the Evaluation model. "
    "\n"
)
print(DATASET_NOTES)


# ---------------------------------------------------------------------------
# Train / test split
# ---------------------------------------------------------------------------

# Column 1 is used as the train flag, column 2 as the label, and columns 5
# onwards as the features.
ix_training = data[:, 1] == 1
ix_testing = data[:, 1] == 0

training_data = data[ix_training, 5:]
testing_data = data[ix_testing, 5:]

training_labels = data[ix_training, 2]
testing_labels = data[ix_testing, 2]


def _print_shapes(label, features, labels):
    """Print ``label`` and the shapes of both arrays, separated by spaces."""
    print(" ".join([label, str(features.shape), str(labels.shape)]))


_print_shapes("training:", training_data, training_labels)
_print_shapes("testing: ", testing_data, testing_labels)

figsize(12, 4)
hist(training_data.flatten())

print(training_data.shape[0] * training_data.shape[1])


# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------

# One inclusion flag and one coefficient per feature column.  The count is
# taken from the data rather than hard-coded (presumably 200 for this data
# set -- verify against the CSV).
n_features = training_data.shape[1]

to_include = pm.Bernoulli("to_include", 0.5, size=n_features)
coef = pm.Uniform("coefs", 0, 1, size=n_features)


@pm.deterministic
def Z(coef=coef, to_include=to_include, data=training_data):
    """Return the mean-centred linear score of every row in ``data``.

    Each feature column is multiplied by its inclusion flag, the result is
    projected onto ``coef``, and the mean of the scores is subtracted.
    """
    # Use the `data` parent rather than the module-level `training_data`, so
    # the node depends only on its declared parents.
    ym = np.dot(to_include * data, coef)
    return ym - ym.mean()


@pm.deterministic
def T(z=Z):
    """Map the sign of each score to a Bernoulli probability.

    Negative scores give 0.045, zero gives 0.495 and positive scores give
    0.945, so the probability never reaches exactly 0 or 1.
    """
    return 0.45 * (np.sign(z) + 1.1)


obs = pm.Bernoulli("obs", T, value=training_labels, observed=True)

model = pm.Model([to_include, coef, Z, T, obs])

# Fit a MAP estimate first, then sample.
map_ = pm.MAP(model)
map_.fit()

mcmc = pm.MCMC(model)
mcmc.sample(iter=100000, burn=90000, thin=1)


# ---------------------------------------------------------------------------
# Diagnostics
# ---------------------------------------------------------------------------

# Fraction of training rows whose rounded current value of T equals the label.
current_accuracy = (np.round(T.value) == training_labels).mean()

t_trace = mcmc.trace("T")[:]

# Same comparison using the per-row mean of the rounded trace over the slice
# of retained samples [-500:-400).
window_accuracy = (
    np.round(t_trace[-500:-400, :]).mean(axis=0) == training_labels
).mean()

t_mean = np.round(t_trace).mean(axis=1)

imshow(t_trace[-10000:, :], aspect="auto")
colorbar()

figsize(23, 8)
coef_trace = mcmc.trace("coefs")[:]
imshow(coef_trace[-10000:, :], aspect="auto", cmap=pyplot.cm.RdBu,
       interpolation="none")

include_trace = mcmc.trace("to_include")[:]
figsize(23, 8)
imshow(include_trace[-10000:, :], aspect="auto", interpolation="none")