In [2]:
# First let's import the dataset, using Pandas.
import pandas as pd

# Load the training split into a DataFrame. The path is relative, so the
# kernel's working directory must be the one that holds train.csv
# (easy to get wrong when launching iPython from somewhere else).
train = pd.read_csv(filepath_or_buffer="train.csv")

# Preview the first five rows (five is head()'s default). Ignore the first
# column of the rendered table; per the author it is a leftover of how the
# data was split.
train.head(n=5)

Out[2]:
class petal_length petal_width sepal_length sepal_width
0 Iris-virginica 5.5 1.8 6.4 3.1
1 Iris-virginica 5.9 2.3 6.8 3.2
2 Iris-virginica 5.4 2.3 6.2 3.4
3 Iris-virginica 4.8 1.8 6.0 3.0
4 Iris-virginica 5.1 2.3 6.9 3.1
In [3]:
from sklearn.ensemble import RandomForestClassifier

# The random forest is fed plain NumPy arrays here, so pull the feature and
# label columns out of the DataFrame before fitting.
cols = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
colsRes = ['class']
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# Selecting the columns and taking .values gives the same arrays and works on
# both old and new pandas.
trainArr = train[cols].values       # training array: one row per sample, one column per feature
trainRes = train[colsRes].values    # training results: column vector of shape (n_samples, 1)

## Training!

# 100 decision trees is a good enough number. No random_state is passed, so
# the fitted forest can differ from one run to the next.
rf = RandomForestClassifier(n_estimators=100)
# fit() expects y as a 1-D array of shape (n_samples,). ravel() flattens the
# column vector, which is exactly what the DataConversionWarning asks for.
rf.fit(trainArr, trainRes.ravel())  # finally, we fit the data to the algorithm!!! :)

# note - the DataConversionWarning in the saved output below came from passing
# y as a column vector; with the ravel() above it should not reappear on re-run.

/Users/alexwoods/Downloads/ipython-3.2.0/IPython/kernel/__main__.py:16: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Out[3]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
In [5]:
## Testing!

# NOTE(review): `test` is not created in any cell visible here (execution
# jumps from In [3] to In [5]) - presumably an earlier cell loads the test
# split as a DataFrame with the same feature columns as `train`. Confirm that
# cell exists, otherwise this fails on a fresh kernel.

# put the test features in the same format as the training features!
# DataFrame.as_matrix() was removed in pandas 1.0; column selection plus
# .values is the drop-in replacement that works on old and new pandas.
testArr = test[cols].values

results = rf.predict(testArr)

# something I like to do is to add it back to the dataframe, so I can compare side-by-side
# (this adds a 'predictions' column to `test` in place)
test['predictions'] = results