# The rattle package in R ships the weather dataset used below
#(see help at http://artax.karlin.mff.cuni.cz/r-help/library/rattle/html/weather.html)
import os as os
import pandas as pd
# Show the current working directory.
os.getcwd()
'/home/ajayohri'
# List every entry in the current working directory.
os.listdir()
['.hplip', '.xsession-errors.old', 'VirtualBox VMs', 'filename.pkl_04.npy', '.thunderbird', 'SVM.R', 'R', 'Desktop', 'filename.pkl_07.npy', '.cache', '.webex', 'file.R', '.ipython', 'unique_ids_for_list.html', 'filename.pkl_11.npy', '.Xauthority', 'Dropbox', 'examples.desktop', 'machine learning-plot and bagged pima indians.ipynb', 'date time.ipynb', 'Untitled.ipynb', '.rstudio-desktop', 'filename.pkl_01.npy', 'anaconda3', '.dropbox', 'Music', '.pki', 'rsconnect', 'GoodReads.ipynb', '.config', 'diamsum.html', 'filename.pkl_06.npy', 'data inspection .ipynb', '.sudo_as_admin_successful', '.continuum', '.java', 'unique ids for list.R', '.bashrc-anaconda3.bak', '.texmf-var', 'numpy scipy pandas.ipynb', 'mozilla.pdf', '.dropbox-dist', '.bash_logout', '.jupyter', '.ecryptfs', '.dbus', '.local', '.lyx', '.xsession-errors', 'hebrew', 'RCommanderMarkdown.Rmd', '.bash_history', 'SAS', 'nbr2mp4.sh', '.adobe', '.Skype', 'filename.pkl_05.npy', '.wajig', 'ajay ohri.odt', '.macromedia', '.gphoto', '.oracle_jre_usage', 'machine learning-rattle dataset from R.ipynb', '.profile', 'file operations.ipynb', 'Documents', 'filename.pkl_09.npy', 'Videos', 'RCommander.R', 'filename.pkl_08.npy', '.gstreamer-0.10', 'SVM.html', '.Private', 'RCommander.txt', 're for searching strings.ipynb', '.Rhistory', 'filename.pkl_02.npy', 'RcmdrMarkdown.Rmd', 'Scikit Tutorial', 'machine learning.ipynb', '.ivy2', 'assignment2.R', 'assignment2.html', 'filename.pkl_03.npy', 'Public', 'nbr2mp4.tar', 'RcmdrMarkdown.md', '.bashrc', '.mozilla', 'Pictures', 'Data Viz Tutorial.ipynb', 'filename.pkl_10.npy', '.RData', '.gconf', 'data transformations.ipynb', 'RcmdrMarkdown.html', 'file.html', 'Scikit Tutorial.ipynb', 'Strings, Lists and Maps.ipynb', 'filename.pkl', 'weather.csv', 'Downloads', '.gnupg', '.nano', 'variables in strings.ipynb', 'Templates', '.ICEauthority', '.ipynb_checkpoints']
#Finding only csv files in a directory using os and glob packages
import glob
# Collect the names of all CSV files in the current working directory.
# A glob pattern without a directory part is resolved relative to the current
# working directory, so there is no need to chdir into `path` first, and
# glob.glob() already returns a list, so no copy is needed.
path = os.getcwd()
extension = 'csv'
result = glob.glob('*.{}'.format(extension))
print(result)
['weather.csv']
# Load the weather observations from the CSV file found above.
dataframe = pd.read_csv("weather.csv")
# Preview the first rows of the loaded frame.
dataframe.head()
Unnamed: 0 | Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | ... | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RISK_MM | RainTomorrow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2007-11-01 | Canberra | 8.0 | 24.3 | 0.0 | 3.4 | 6.3 | NW | 30.0 | ... | 29 | 1019.7 | 1015.0 | 7 | 7 | 14.4 | 23.6 | No | 3.6 | Yes |
1 | 2 | 2007-11-02 | Canberra | 14.0 | 26.9 | 3.6 | 4.4 | 9.7 | ENE | 39.0 | ... | 36 | 1012.4 | 1008.4 | 5 | 3 | 17.5 | 25.7 | Yes | 3.6 | Yes |
2 | 3 | 2007-11-03 | Canberra | 13.7 | 23.4 | 3.6 | 5.8 | 3.3 | NW | 85.0 | ... | 69 | 1009.5 | 1007.2 | 8 | 7 | 15.4 | 20.2 | Yes | 39.8 | Yes |
3 | 4 | 2007-11-04 | Canberra | 13.3 | 15.5 | 39.8 | 7.2 | 9.1 | NW | 54.0 | ... | 56 | 1005.5 | 1007.0 | 2 | 7 | 13.5 | 14.1 | Yes | 2.8 | Yes |
4 | 5 | 2007-11-05 | Canberra | 7.6 | 16.1 | 2.8 | 5.6 | 10.6 | SSE | 50.0 | ... | 49 | 1018.3 | 1018.5 | 7 | 7 | 11.1 | 15.4 | Yes | 0.0 | No |
5 rows × 25 columns
# Summarise each column's dtype and non-null count to spot missing values.
dataframe.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 366 entries, 0 to 365 Data columns (total 25 columns): Unnamed: 0 366 non-null int64 Date 366 non-null object Location 366 non-null object MinTemp 366 non-null float64 MaxTemp 366 non-null float64 Rainfall 366 non-null float64 Evaporation 366 non-null float64 Sunshine 363 non-null float64 WindGustDir 363 non-null object WindGustSpeed 364 non-null float64 WindDir9am 335 non-null object WindDir3pm 365 non-null object WindSpeed9am 359 non-null float64 WindSpeed3pm 366 non-null int64 Humidity9am 366 non-null int64 Humidity3pm 366 non-null int64 Pressure9am 366 non-null float64 Pressure3pm 366 non-null float64 Cloud9am 366 non-null int64 Cloud3pm 366 non-null int64 Temp9am 366 non-null float64 Temp3pm 366 non-null float64 RainToday 366 non-null object RISK_MM 366 non-null float64 RainTomorrow 366 non-null object dtypes: float64(12), int64(6), object(7) memory usage: 71.6+ KB
# Remove the 'Unnamed: 0' column that read_csv created from the CSV's
# row-number column. `axis` is passed by keyword: the positional form was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
dataframe = dataframe.drop('Unnamed: 0', axis=1)
# Summary statistics for the numeric columns.
dataframe.describe()
/home/ajayohri/anaconda3/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile RuntimeWarning)
MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RISK_MM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 363.000000 | 364.000000 | 359.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 |
mean | 7.265574 | 20.550273 | 1.428415 | 4.521858 | 7.909366 | 39.840659 | 9.651811 | 17.986339 | 72.035519 | 44.519126 | 1019.709016 | 1016.810383 | 3.890710 | 4.024590 | 12.358470 | 19.230874 | 1.428415 |
std | 6.025800 | 6.690516 | 4.225800 | 2.669383 | 3.481517 | 13.059807 | 7.951929 | 8.856997 | 13.137058 | 16.850947 | 6.686212 | 6.469422 | 2.956131 | 2.666268 | 5.630832 | 6.640346 | 4.225800 |
min | -5.300000 | 7.600000 | 0.000000 | 0.200000 | 0.000000 | 13.000000 | 0.000000 | 0.000000 | 36.000000 | 13.000000 | 996.500000 | 996.800000 | 0.000000 | 0.000000 | 0.100000 | 5.100000 | 0.000000 |
25% | 2.300000 | 15.025000 | 0.000000 | 2.200000 | NaN | NaN | NaN | 11.000000 | 64.000000 | 32.250000 | 1015.350000 | 1012.800000 | 1.000000 | 1.000000 | 7.625000 | 14.150000 | 0.000000 |
50% | 7.450000 | 19.650000 | 0.000000 | 4.200000 | NaN | NaN | NaN | 17.000000 | 72.000000 | 43.000000 | 1020.150000 | 1017.400000 | 3.500000 | 4.000000 | 12.550000 | 18.550000 | 0.000000 |
75% | 12.500000 | 25.500000 | 0.200000 | 6.400000 | NaN | NaN | NaN | 24.000000 | 81.000000 | 55.000000 | 1024.475000 | 1021.475000 | 7.000000 | 7.000000 | 17.000000 | 24.000000 | 0.200000 |
max | 20.900000 | 35.800000 | 39.800000 | 13.800000 | 13.600000 | 98.000000 | 41.000000 | 52.000000 | 99.000000 | 96.000000 | 1035.700000 | 1033.200000 | 8.000000 | 8.000000 | 24.700000 | 34.500000 | 39.800000 |
# Distinct values in RainTomorrow (the column later used as the target).
dataframe['RainTomorrow'].unique()
array(['Yes', 'No'], dtype=object)
# Distinct values in RainToday.
dataframe['RainToday'].unique()
array(['No', 'Yes'], dtype=object)
# Distinct values in Location.
dataframe['Location'].unique()
array(['Canberra'], dtype=object)
# Distinct values in Date.
dataframe['Date'].unique()
array(['2007-11-01', '2007-11-02', '2007-11-03', '2007-11-04', '2007-11-05', '2007-11-06', '2007-11-07', '2007-11-08', '2007-11-09', '2007-11-10', '2007-11-11', '2007-11-12', '2007-11-13', '2007-11-14', '2007-11-15', '2007-11-16', '2007-11-17', '2007-11-18', '2007-11-19', '2007-11-20', '2007-11-21', '2007-11-22', '2007-11-23', '2007-11-24', '2007-11-25', '2007-11-26', '2007-11-27', '2007-11-28', '2007-11-29', '2007-11-30', '2007-12-01', '2007-12-02', '2007-12-03', '2007-12-04', '2007-12-05', '2007-12-06', '2007-12-07', '2007-12-08', '2007-12-09', '2007-12-10', '2007-12-11', '2007-12-12', '2007-12-13', '2007-12-14', '2007-12-15', '2007-12-16', '2007-12-17', '2007-12-18', '2007-12-19', '2007-12-20', '2007-12-21', '2007-12-22', '2007-12-23', '2007-12-24', '2007-12-25', '2007-12-26', '2007-12-27', '2007-12-28', '2007-12-29', '2007-12-30', '2007-12-31', '2008-01-01', '2008-01-02', '2008-01-03', '2008-01-04', '2008-01-05', '2008-01-06', '2008-01-07', '2008-01-08', '2008-01-09', '2008-01-10', '2008-01-11', '2008-01-12', '2008-01-13', '2008-01-14', '2008-01-15', '2008-01-16', '2008-01-17', '2008-01-18', '2008-01-19', '2008-01-20', '2008-01-21', '2008-01-22', '2008-01-23', '2008-01-24', '2008-01-25', '2008-01-26', '2008-01-27', '2008-01-28', '2008-01-29', '2008-01-30', '2008-01-31', '2008-02-01', '2008-02-02', '2008-02-03', '2008-02-04', '2008-02-05', '2008-02-06', '2008-02-07', '2008-02-08', '2008-02-09', '2008-02-10', '2008-02-11', '2008-02-12', '2008-02-13', '2008-02-14', '2008-02-15', '2008-02-16', '2008-02-17', '2008-02-18', '2008-02-19', '2008-02-20', '2008-02-21', '2008-02-22', '2008-02-23', '2008-02-24', '2008-02-25', '2008-02-26', '2008-02-27', '2008-02-28', '2008-02-29', '2008-03-01', '2008-03-02', '2008-03-03', '2008-03-04', '2008-03-05', '2008-03-06', '2008-03-07', '2008-03-08', '2008-03-09', '2008-03-10', '2008-03-11', '2008-03-12', '2008-03-13', '2008-03-14', '2008-03-15', '2008-03-16', '2008-03-17', '2008-03-18', '2008-03-19', '2008-03-20', '2008-03-21', 
'2008-03-22', '2008-03-23', '2008-03-24', '2008-03-25', '2008-03-26', '2008-03-27', '2008-03-28', '2008-03-29', '2008-03-30', '2008-03-31', '2008-04-01', '2008-04-02', '2008-04-03', '2008-04-04', '2008-04-05', '2008-04-06', '2008-04-07', '2008-04-08', '2008-04-09', '2008-04-10', '2008-04-11', '2008-04-12', '2008-04-13', '2008-04-14', '2008-04-15', '2008-04-16', '2008-04-17', '2008-04-18', '2008-04-19', '2008-04-20', '2008-04-21', '2008-04-22', '2008-04-23', '2008-04-24', '2008-04-25', '2008-04-26', '2008-04-27', '2008-04-28', '2008-04-29', '2008-04-30', '2008-05-01', '2008-05-02', '2008-05-03', '2008-05-04', '2008-05-05', '2008-05-06', '2008-05-07', '2008-05-08', '2008-05-09', '2008-05-10', '2008-05-11', '2008-05-12', '2008-05-13', '2008-05-14', '2008-05-15', '2008-05-16', '2008-05-17', '2008-05-18', '2008-05-19', '2008-05-20', '2008-05-21', '2008-05-22', '2008-05-23', '2008-05-24', '2008-05-25', '2008-05-26', '2008-05-27', '2008-05-28', '2008-05-29', '2008-05-30', '2008-05-31', '2008-06-01', '2008-06-02', '2008-06-03', '2008-06-04', '2008-06-05', '2008-06-06', '2008-06-07', '2008-06-08', '2008-06-09', '2008-06-10', '2008-06-11', '2008-06-12', '2008-06-13', '2008-06-14', '2008-06-15', '2008-06-16', '2008-06-17', '2008-06-18', '2008-06-19', '2008-06-20', '2008-06-21', '2008-06-22', '2008-06-23', '2008-06-24', '2008-06-25', '2008-06-26', '2008-06-27', '2008-06-28', '2008-06-29', '2008-06-30', '2008-07-01', '2008-07-02', '2008-07-03', '2008-07-04', '2008-07-05', '2008-07-06', '2008-07-07', '2008-07-08', '2008-07-09', '2008-07-10', '2008-07-11', '2008-07-12', '2008-07-13', '2008-07-14', '2008-07-15', '2008-07-16', '2008-07-17', '2008-07-18', '2008-07-19', '2008-07-20', '2008-07-21', '2008-07-22', '2008-07-23', '2008-07-24', '2008-07-25', '2008-07-26', '2008-07-27', '2008-07-28', '2008-07-29', '2008-07-30', '2008-07-31', '2008-08-01', '2008-08-02', '2008-08-03', '2008-08-04', '2008-08-05', '2008-08-06', '2008-08-07', '2008-08-08', '2008-08-09', '2008-08-10', 
'2008-08-11', '2008-08-12', '2008-08-13', '2008-08-14', '2008-08-15', '2008-08-16', '2008-08-17', '2008-08-18', '2008-08-19', '2008-08-20', '2008-08-21', '2008-08-22', '2008-08-23', '2008-08-24', '2008-08-25', '2008-08-26', '2008-08-27', '2008-08-28', '2008-08-29', '2008-08-30', '2008-08-31', '2008-09-01', '2008-09-02', '2008-09-03', '2008-09-04', '2008-09-05', '2008-09-06', '2008-09-07', '2008-09-08', '2008-09-09', '2008-09-10', '2008-09-11', '2008-09-12', '2008-09-13', '2008-09-14', '2008-09-15', '2008-09-16', '2008-09-17', '2008-09-18', '2008-09-19', '2008-09-20', '2008-09-21', '2008-09-22', '2008-09-23', '2008-09-24', '2008-09-25', '2008-09-26', '2008-09-27', '2008-09-28', '2008-09-29', '2008-09-30', '2008-10-01', '2008-10-02', '2008-10-03', '2008-10-04', '2008-10-05', '2008-10-06', '2008-10-07', '2008-10-08', '2008-10-09', '2008-10-10', '2008-10-11', '2008-10-12', '2008-10-13', '2008-10-14', '2008-10-15', '2008-10-16', '2008-10-17', '2008-10-18', '2008-10-19', '2008-10-20', '2008-10-21', '2008-10-22', '2008-10-23', '2008-10-24', '2008-10-25', '2008-10-26', '2008-10-27', '2008-10-28', '2008-10-29', '2008-10-30', '2008-10-31'], dtype=object)
# Bagged Decision Trees for Classification
from sklearn import cross_validation
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# List the current column names before deciding which ones to discard.
dataframe.columns
Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'], dtype='object')
# Discard, in place, the columns that are not used as model inputs.
# NOTE(review): WindSpeed3pm is numeric yet is dropped together with the
# text columns -- confirm this is intended.
for column_to_drop in ('Date', 'Location', 'WindDir9am', 'WindSpeed3pm',
                       'WindGustDir', 'WindDir3pm', 'RISK_MM'):
    del dataframe[column_to_drop]
# Encode the Yes/No flags as 1/0, then discard every row that still contains
# a missing value.
dataframe = dataframe.replace(['Yes', 'No'], [1, 0]).dropna()
dataframe.head()
MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8.0 | 24.3 | 0.0 | 3.4 | 6.3 | 30.0 | 6.0 | 68 | 29 | 1019.7 | 1015.0 | 7 | 7 | 14.4 | 23.6 | 0 | 1 |
1 | 14.0 | 26.9 | 3.6 | 4.4 | 9.7 | 39.0 | 4.0 | 80 | 36 | 1012.4 | 1008.4 | 5 | 3 | 17.5 | 25.7 | 1 | 1 |
2 | 13.7 | 23.4 | 3.6 | 5.8 | 3.3 | 85.0 | 6.0 | 82 | 69 | 1009.5 | 1007.2 | 8 | 7 | 15.4 | 20.2 | 1 | 1 |
3 | 13.3 | 15.5 | 39.8 | 7.2 | 9.1 | 54.0 | 30.0 | 62 | 56 | 1005.5 | 1007.0 | 2 | 7 | 13.5 | 14.1 | 1 | 1 |
4 | 7.6 | 16.1 | 2.8 | 5.6 | 10.6 | 50.0 | 20.0 | 68 | 49 | 1018.3 | 1018.5 | 7 | 7 | 11.1 | 15.4 | 1 | 0 |
# Number of rows left after dropping rows with missing values.
len(dataframe)
354
# Number of columns left after the deletions above.
len(dataframe.columns)
17
# Keep the column labels; names[:-1] is later passed to export_graphviz as
# the feature names.
names=dataframe.columns
names
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow'], dtype='object')
# Summary statistics for the cleaned frame (Yes/No columns now encoded 1/0).
dataframe.describe()
MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 | 354.000000 |
mean | 7.362429 | 20.601412 | 1.420904 | 4.558192 | 7.925424 | 40.011299 | 9.666667 | 71.875706 | 44.454802 | 1019.562147 | 1016.692090 | 3.920904 | 4.019774 | 12.438701 | 19.271469 | 0.180791 | 0.180791 |
std | 6.010927 | 6.708966 | 4.235358 | 2.667877 | 3.510039 | 13.034488 | 7.978489 | 13.161939 | 16.944316 | 6.602685 | 6.373679 | 2.962363 | 2.672312 | 5.630160 | 6.663681 | 0.385390 | 0.385390 |
min | -5.300000 | 7.600000 | 0.000000 | 0.200000 | 0.000000 | 13.000000 | 0.000000 | 36.000000 | 13.000000 | 996.500000 | 996.800000 | 0.000000 | 0.000000 | 0.100000 | 5.100000 | 0.000000 | 0.000000 |
25% | 2.400000 | 15.100000 | 0.000000 | 2.400000 | 5.925000 | 31.000000 | 6.000000 | 64.000000 | 32.000000 | 1015.225000 | 1012.725000 | 1.000000 | 1.000000 | 7.725000 | 14.300000 | 0.000000 | 0.000000 |
50% | 7.500000 | 19.750000 | 0.000000 | 4.200000 | 8.650000 | 39.000000 | 7.000000 | 72.000000 | 43.000000 | 1020.000000 | 1017.200000 | 4.000000 | 4.000000 | 12.600000 | 18.600000 | 0.000000 | 0.000000 |
75% | 12.500000 | 25.500000 | 0.200000 | 6.400000 | 10.600000 | 46.000000 | 13.000000 | 80.000000 | 54.750000 | 1024.400000 | 1021.350000 | 7.000000 | 7.000000 | 17.000000 | 24.000000 | 0.000000 | 0.000000 |
max | 20.900000 | 35.800000 | 39.800000 | 13.800000 | 13.600000 | 98.000000 | 41.000000 | 99.000000 | 96.000000 | 1035.700000 | 1033.200000 | 8.000000 | 8.000000 | 24.700000 | 34.500000 | 1.000000 | 1.000000 |
# Check the type of the cleaned object.
type(dataframe)
pandas.core.frame.DataFrame
# Extract the underlying NumPy array for scikit-learn.
array = dataframe.values
# Class balance of the target. The Series method is used because the
# top-level pd.value_counts() helper is deprecated in recent pandas releases;
# the result is the same.
dataframe["RainTomorrow"].value_counts()
0 290 1 64 Name: RainTomorrow, dtype: int64
# Echo the extracted array.
array
array([[ 8. , 24.3, 0. , ..., 23.6, 0. , 1. ], [ 14. , 26.9, 3.6, ..., 25.7, 1. , 1. ], [ 13.7, 23.4, 3.6, ..., 20.2, 1. , 1. ], ..., [ 12.5, 19.9, 0. , ..., 18.3, 0. , 0. ], [ 12.5, 26.9, 0. , ..., 25.9, 0. , 0. ], [ 12.3, 30.2, 0. , ..., 28.6, 0. , 0. ]])
# Split the array into features and target. The target (RainTomorrow) is the
# last column, so slice relative to the end instead of hard-coding index 16;
# this matches the names[:-1] feature labels used for the tree plot and keeps
# working if the set of feature columns changes.
X = array[:, :-1]
Y = array[:, -1]
# Cross-validation settings.
num_folds = 10
num_instances = len(X)
seed = 7
type(X)
numpy.ndarray
# Echo the feature matrix.
X
array([[ 8. , 24.3, 0. , ..., 14.4, 23.6, 0. ], [ 14. , 26.9, 3.6, ..., 17.5, 25.7, 1. ], [ 13.7, 23.4, 3.6, ..., 15.4, 20.2, 1. ], ..., [ 12.5, 19.9, 0. , ..., 14.5, 18.3, 0. ], [ 12.5, 26.9, 0. , ..., 15.8, 25.9, 0. ], [ 12.3, 30.2, 0. , ..., 23.8, 28.6, 0. ]])
#Y[Y == "Yes"] = 1  # An alternative way to change values in a NumPy array
#Y[Y == "No"] = 0
# Echo the target vector (Yes was encoded as 1 and No as 0 above).
Y
array([ 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
# Fit a shallow (max_depth=3) tree on the 0/1 target; it is rendered with
# Graphviz further down.
# NOTE(review): this is a DecisionTreeRegressor fitted to a binary label under
# a "Classification" heading -- confirm a regressor is intended here.
dtr = tree.DecisionTreeRegressor(max_depth=3)
dtr.fit(X, Y)
DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')
# from sklearn.metrics import roc_curve, auc
#!sudo pip install pydotplus
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
# http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
# http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
#!pip freeze
#checking if we have the right packages
#!pip install --upgrade pip
#!pip install pydotplus
import pydotplus as pydot
from IPython.display import Image
from sklearn.externals.six import StringIO
# Graphviz
#sudo add-apt-repository ppa:gviz-adm/graphviz-dev
# sudo apt-get update
# http://www.graphviz.org/Download_linux_ubuntu.php
# Write the fitted tree as Graphviz DOT text into an in-memory buffer,
# labelling the features with every column name except the last one.
dot_data = StringIO()
tree.export_graphviz(
    dtr,
    out_file=dot_data,
    feature_names=names[:-1],
)
# Parse the DOT text and display the rendered PNG inline.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Fold definition for cross-validation over all rows.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 in
# favour of sklearn.model_selection; migrating means changing this KFold
# signature and the later cross_val_score call together.
kfold = cross_validation.KFold(
    n=num_instances,
    n_folds=num_folds,
    random_state=seed,
)
# Bagging ensemble of unpruned decision trees.
num_trees = 100
cart = DecisionTreeClassifier()
model = BaggingClassifier(
    base_estimator=cart,
    n_estimators=num_trees,
    random_state=seed,
)
model
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best'), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=7, verbose=0, warm_start=False)
# Echo the fold configuration.
kfold
sklearn.cross_validation.KFold(n=354, n_folds=10, shuffle=False, random_state=7)
# Score the bagged model on each fold, then report the mean of the per-fold
# scores.
results = cross_validation.cross_val_score(
    model,
    X,
    Y,
    cv=kfold,
)
print(results.mean())
0.850873015873
# Echo the individual per-fold scores.
results
array([ 0.75 , 0.86111111, 0.69444444, 0.88888889, 0.88571429, 0.82857143, 0.91428571, 0.85714286, 0.94285714, 0.88571429])