# Exploratory script: load "train.csv", impute missing Age/Embarked values,
# and cross-validate a random forest that predicts the Survived column.
#
# NOTE(review): this uses the legacy DataFrames/DataArrays API (`readtable`,
# `pool!`, `isna`, `array`, `df[:col]`) and `utf8`; it presumably targets an
# old Julia release -- confirm the pinned package versions before running.
#
# Bare expressions such as `typeof(df)` have no side effects; their values are
# only displayed when the script is evaluated interactively (REPL / notebook).

using Gadfly
using DataFrames

# --- Load and inspect the raw data ------------------------------------------
df = readtable("train.csv")
describe(df)
typeof(df)
df[1, :]
df[:Name]

# --- Treat the categorical columns as pooled (factor-like) data --------------
pool!(df, [:Sex])
pool!(df, [:Survived])
pool!(df, [:Pclass])
plot(df, x="Sex", color="Survived", Geom.histogram)

# --- Impute missing ages with the mean of the observed ages ------------------
df[!isna(df[:Age]), :]
averageAge = mean(df[!isna(df[:Age]), :Age])
# `array(data, replacement)` is expected to return a plain Array with NA
# entries replaced by `replacement` -- TODO confirm against the DataArrays
# version in use.
df[:Age] = array(df[:Age], averageAge)
typeof(df[:Sex])

# --- Impute missing embarkation ports with "S" -------------------------------
plot(x=df[!isna(df[:Embarked]), :Embarked], Geom.histogram)
df[:Embarked] = array(df[:Embarked], utf8("S"))
pool!(df, [:Embarked])
typeof(df[:Embarked])

# --- Select the feature columns -----------------------------------------------
newdata = df[:, [:Pclass, :Age, :Sex, :SibSp, :Parch, :Fare, :Embarked]]
describe(newdata)

# --- Random-forest cross-validation -------------------------------------------
using DecisionTree

xTrain = newdata
yTrain = df[:Survived]
# DecisionTree works on plain arrays, so convert labels and features.
yTrain = array(yTrain)
xTrain = array(xTrain)

# Positional arguments after the data are presumably: number of random
# sub-features per split (5), number of trees (20), number of folds (4) and
# fraction of samples drawn per tree (0.7) -- verify against the
# DecisionTree.jl documentation for the installed version.
accuracy = nfoldCV_forest(yTrain, xTrain, 5, 20, 4, 0.7)