import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Print NumPy floats with 3 decimal places and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Load the table from 'datafile.csv' into a DataFrame.
dataset = pd.read_csv('datafile.csv')
# Bare expression: in a notebook cell this echoes the DataFrame; it has no
# effect when the file is run as a plain script.
dataset
 | Position | Level | Salary |
---|---|---|---|
0 | Business Analyst | 1 | 45000 |
1 | Junior Consultant | 2 | 50000 |
2 | Senior Consultant | 3 | 60000 |
3 | Manager | 4 | 80000 |
4 | Country Manager | 5 | 110000 |
5 | Region Manager | 6 | 150000 |
6 | Partner | 7 | 200000 |
7 | Senior Partner | 8 | 300000 |
8 | C-level | 9 | 500000 |
9 | CEO | 10 | 1000000 |
from sklearn.ensemble import RandomForestRegressor

# Feature matrix: column 1 only. Selecting with a list keeps the result
# two-dimensional, i.e. one row per sample and a single feature column.
X = dataset.iloc[:, [1]].values
# Target vector: column 2 as a one-dimensional array.
y = dataset.iloc[:, 2].values

# 300 trees; the fixed random_state makes the fitted forest reproducible.
regressor = RandomForestRegressor(random_state=0, n_estimators=300)
regressor.fit(X, y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1, oob_score=False, random_state=0, verbose=0, warm_start=False)
# scikit-learn estimators expect a 2-D input of shape (n_samples, n_features);
# a bare scalar is rejected by current releases. Wrap the single query value
# as one sample with one feature.
regressor.predict([[5]])
array([106933.333])
# Build a dense grid of feature values (step 0.01) so the piecewise-constant
# shape of the forest's prediction is visible in the plot.
# X.min()/X.max() return plain scalars; the builtin min()/max() would iterate
# the rows of the 2-D array and hand one-element arrays to np.arange.
# np.arange is half-open, so the upper bound itself is not part of the grid.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

# Red dots: observed data points; blue line: model prediction over the grid.
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Random Forest Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()