import pandas as pd
# Read only the first 6 rows of the CSV (nrows=6) so each split below can be displayed in full
df = pd.read_csv('http://bit.ly/kaggletrain', nrows=6)
# X holds the three selected columns; y holds the 'Survived' column
cols = ['Fare', 'Embarked', 'Sex']
X = df[cols]
y = df['Survived']
from sklearn.model_selection import train_test_split
X
| | Fare | Embarked | Sex |
---|---|---|---|
0 | 7.2500 | S | male |
1 | 71.2833 | C | female |
2 | 7.9250 | S | female |
3 | 53.1000 | S | female |
4 | 8.0500 | S | male |
5 | 8.4583 | Q | male |
# any non-negative integer (0 included) can be used for the random_state value
# test_size=0.5 puts half of the 6 rows in the test set, leaving 3 rows for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
X_train
| | Fare | Embarked | Sex |
---|---|---|---|
0 | 7.2500 | S | male |
3 | 53.1000 | S | female |
5 | 8.4583 | Q | male |
# using the SAME random_state value (with the same data and arguments) results in the SAME random split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
X_train
| | Fare | Embarked | Sex |
---|---|---|---|
0 | 7.2500 | S | male |
3 | 53.1000 | S | female |
5 | 8.4583 | Q | male |
# using a DIFFERENT random_state value results in a DIFFERENT random split
# (only random_state changes here; X, y and test_size are the same as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)
X_train
| | Fare | Embarked | Sex |
---|---|---|---|
2 | 7.9250 | S | female |
5 | 8.4583 | Q | male |
0 | 7.2500 | S | male |
© 2020 Data School. All rights reserved.