# Pandas is used for data manipulation
import pandas as pd
# Load the temperature records from data/temps.csv into a pandas DataFrame
# and preview the first 5 rows (the bare expression is what the notebook displays)
features = pd.read_csv('data/temps.csv')
features.head(5)
year | month | day | week | temp_2 | temp_1 | average | actual | friend | |
---|---|---|---|---|---|---|---|---|---|
0 | 2016 | 1 | 1 | Fri | 45 | 45 | 45.6 | 45 | 29 |
1 | 2016 | 1 | 2 | Sat | 44 | 45 | 45.7 | 44 | 61 |
2 | 2016 | 1 | 3 | Sun | 45 | 44 | 45.8 | 41 | 56 |
3 | 2016 | 1 | 4 | Mon | 44 | 41 | 45.9 | 40 | 53 |
4 | 2016 | 1 | 5 | Tues | 41 | 40 | 46.0 | 44 | 41 |
# Column names of the raw table as a plain Python list
# (at this point it still contains the 'actual' target and the text column 'week')
features_list = list(features.columns)
features_list
['year', 'month', 'day', 'week', 'temp_2', 'temp_1', 'average', 'actual', 'friend']
# Zero-based position of the 'temp_1' column within features_list
features_list.index('temp_1')
5
# features.shape is a (rows, columns) tuple for the raw table
print('The shape of our features is:', features.shape)
The shape of our features is: (348, 9)
# Descriptive statistics for the numeric columns
# (the text column 'week' does not appear in the summary shown below)
features.describe()
year | month | day | temp_2 | temp_1 | average | actual | friend | |
---|---|---|---|---|---|---|---|---|
count | 348.0 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 |
mean | 2016.0 | 6.477011 | 15.514368 | 62.511494 | 62.560345 | 59.760632 | 62.543103 | 60.034483 |
std | 0.0 | 3.498380 | 8.772982 | 11.813019 | 11.767406 | 10.527306 | 11.794146 | 15.626179 |
min | 2016.0 | 1.000000 | 1.000000 | 35.000000 | 35.000000 | 45.100000 | 35.000000 | 28.000000 |
25% | 2016.0 | 3.000000 | 8.000000 | 54.000000 | 54.000000 | 49.975000 | 54.000000 | 47.750000 |
50% | 2016.0 | 6.000000 | 15.000000 | 62.500000 | 62.500000 | 58.200000 | 62.500000 | 60.000000 |
75% | 2016.0 | 10.000000 | 23.000000 | 71.000000 | 71.000000 | 69.025000 | 71.000000 | 71.000000 |
max | 2016.0 | 12.000000 | 31.000000 | 92.000000 | 92.000000 | 77.400000 | 92.000000 | 95.000000 |
无异常数据,可以直接用啦
# Use datetime for dealing with dates
import datetime
# Pull the calendar components out of the table
years = features['year']
months = features['month']
days = features['day']
# Assemble a 'year-month-day' string for each row and parse it straight
# into a datetime object in a single pass
dates = [
    datetime.datetime.strptime(
        '{}-{}-{}'.format(int(year), int(month), int(day)), '%Y-%m-%d')
    for year, month, day in zip(years, months, days)
]
# Preview the first five parsed dates
dates[:5]
[datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0), datetime.datetime(2016, 1, 4, 0, 0), datetime.datetime(2016, 1, 5, 0, 0)]
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Use the 'fivethirtyeight' look for every figure that follows
plt.style.use('fivethirtyeight')
# 2x2 grid of axes; tilt the date tick labels so they do not overlap
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
fig.autofmt_xdate(rotation=45)
# One row per panel: (axes, column to draw, x-axis label, panel title).
# Only the bottom row of panels carries the 'Date' x label.
panels = [
    (ax1, 'actual', '', 'Max Temp'),                  # actual max temperature measurement
    (ax2, 'temp_1', '', 'Previous Max Temp'),         # temperature from 1 day ago
    (ax3, 'temp_2', 'Date', 'Two Days Prior Max Temp'),  # temperature from 2 days ago
    (ax4, 'friend', 'Date', 'Friend Estimate'),       # friend's estimate
]
for panel_ax, panel_column, panel_xlabel, panel_title in panels:
    panel_ax.plot(dates, features[panel_column])
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Temperature')
    panel_ax.set_title(panel_title)
# Leave some breathing room between the panels
plt.tight_layout(pad=2)
原始数据:
week |
---|
Mon |
Tue |
Wed |
Thu |
Fri |
编码转换后:
Mon | Tue | Wed | Thu | Fri |
---|---|---|---|---|
1 | 0 | 0 | 0 | 0 |
0 | 1 | 0 | 0 | 0 |
0 | 0 | 1 | 0 | 0 |
0 | 0 | 0 | 1 | 0 |
0 | 0 | 0 | 0 | 1 |
# One-hot encode categorical features
# (in the preview below only the text column 'week' is expanded, into week_* indicator columns)
features = pd.get_dummies(features)
features.head(5)
year | month | day | temp_2 | temp_1 | average | actual | friend | week_Fri | week_Mon | week_Sat | week_Sun | week_Thurs | week_Tues | week_Wed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016 | 1 | 1 | 45 | 45 | 45.6 | 45 | 29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2016 | 1 | 2 | 44 | 45 | 45.7 | 44 | 61 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 2016 | 1 | 3 | 45 | 44 | 45.8 | 41 | 56 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 2016 | 1 | 4 | 44 | 41 | 45.9 | 40 | 53 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 2016 | 1 | 5 | 41 | 40 | 46.0 | 44 | 41 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# Report the (rows, columns) shape again now that 'week' has been expanded
print('Shape of features after one-hot encoding:', features.shape)
Shape of features after one-hot encoding: (348, 15)
# Use numpy to convert to arrays
import numpy as np
# The target to predict is the 'actual' column
labels = np.array(features['actual'])
# Everything except the target stays as a model input
features = features.drop(columns='actual')
# Remember the column order so array positions can be mapped back to names later
feature_list = features.columns.tolist()
# Switch from a DataFrame to a plain numpy array
features = np.array(features)
# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=42)
# Report the dimensions of each piece of the split
for split_name, split_array in (
        ('Training Features Shape:', train_features),
        ('Training Labels Shape:', train_labels),
        ('Testing Features Shape:', test_features),
        ('Testing Labels Shape:', test_labels)):
    print(split_name, split_array.shape)
Training Features Shape: (261, 14) Training Labels Shape: (261,) Testing Features Shape: (87, 14) Testing Labels Shape: (87,)
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model: a forest of 1000 trees with a fixed random_state
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)
# Train the model on training data
# (the trailing semicolon keeps the notebook from echoing the fitted estimator)
rf.fit(train_features, train_labels);
# Predict the held-out test rows with the trained forest
predictions = rf.predict(test_features)
# Absolute size of each miss
errors = np.abs(predictions - test_labels)
# Report the mean absolute error (MAE), rounded to two decimals
print('Mean Absolute Error:', round(errors.mean(), 2), 'degrees.')
Mean Absolute Error: 3.83 degrees.
MAPE (mean absolute percentage error)
# Absolute percentage error of every test sample; its mean is the MAPE
mape = (errors / test_labels) * 100
# "Accuracy" here is simply 100 minus the MAPE
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')
Accuracy: 93.99 %.
# A forest with its main hyperparameters spelled out explicitly.
# 'squared_error' is the criterion name used by scikit-learn >= 1.0; the older
# spelling 'mse' was deprecated in 1.0 and removed in 1.2, so it is rejected
# by current releases.
rf_new = RandomForestRegressor(n_estimators = 100, criterion = 'squared_error', max_depth = None,
                               min_samples_split = 2, min_samples_leaf = 1)
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot #pip install pydot
# Pull out one tree from the forest (index 5 of the fitted estimators)
tree = rf.estimators_[5]
# Export the image to a dot file, labelling the nodes with the saved column names
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# Use dot file to create a graph
# (single-element unpacking: the loader's result is expected to hold exactly one graph)
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png');
# Report how deep this particular tree grew
print('The depth of this tree is:', tree.tree_.max_depth)
The depth of this tree is: 15
还是小一点吧。。。
# Limit depth of each tree to 3 levels (max_depth = 3) and grow only 10 trees
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3, random_state=42)
rf_small.fit(train_features, train_labels)
# Extract the small tree (index 5 of the 10 fitted estimators)
tree_small = rf_small.estimators_[5]
# Save the tree as a png image: write a dot file, load it back as a graph, render it
# (note that this rebinds the name `graph`)
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');
# Get numerical feature importances (one value per input column of the fitted forest)
importances = list(rf.feature_importances_)
# List of (variable, importance) tuples, with the importance rounded to two decimals
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
# (the sort is stable, so variables whose rounded values tie keep their column order)
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances, one line per variable.
# A plain loop is used because the printing is a side effect; a list
# comprehension here would only build a throwaway list of None values.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))
Variable: temp_1 Importance: 0.7 Variable: average Importance: 0.19 Variable: day Importance: 0.03 Variable: temp_2 Importance: 0.02 Variable: friend Importance: 0.02 Variable: month Importance: 0.01 Variable: year Importance: 0.0 Variable: week_Fri Importance: 0.0 Variable: week_Mon Importance: 0.0 Variable: week_Sat Importance: 0.0 Variable: week_Sun Importance: 0.0 Variable: week_Thurs Importance: 0.0 Variable: week_Tues Importance: 0.0 Variable: week_Wed Importance: 0.0
# Column positions of the two most important variables
important_indices = [feature_list.index(name) for name in ('temp_1', 'average')]
# Keep only those two columns in both halves of the split
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]
# New random forest (same size and random_state as the full model) trained on the reduced inputs
rf_most_important = RandomForestRegressor(n_estimators=1000, random_state=42)
rf_most_important.fit(train_important, train_labels)
# Make predictions on the reduced test set and measure the absolute errors
predictions = rf_most_important.predict(test_important)
errors = np.abs(predictions - test_labels)
# Display the performance metrics: MAE first, then 100 minus the MAPE
print('Mean Absolute Error:', round(errors.mean(), 2), 'degrees.')
mape = (100 * (errors / test_labels)).mean()
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
Mean Absolute Error: 3.9 degrees. Accuracy: 93.8 %.
# One x position per variable, in the same order as feature_list
x_values = [*range(len(importances))]
# Draw the importances as vertical bars
plt.bar(x_values, importances, orientation='vertical')
# Label each bar with its variable name, written vertically
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title (the final semicolon suppresses the notebook echo)
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');
# Dates of every row in the data set (features holds all rows, not only the training split)
months = features[:, feature_list.index('month')]
days = features[:, feature_list.index('day')]
years = features[:, feature_list.index('year')]
# List and then convert to datetime object
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
# Dataframe with true values and dates
true_data = pd.DataFrame(data = {'date': dates, 'actual': labels})
# Dates of predictions (rows of the test split)
months = test_features[:, feature_list.index('month')]
days = test_features[:, feature_list.index('day')]
years = test_features[:, feature_list.index('year')]
# Column of dates
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
# Convert to datetime objects
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]
# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': predictions})
# Plot the actual values as a blue line
plt.plot(true_data['date'], true_data['actual'], 'b-', label = 'actual')
# Plot the predicted values as red dots
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'prediction')
# Tilt the date labels; rotation must be a number (or 'vertical'/'horizontal'),
# a numeric string such as '60' is rejected by current matplotlib
plt.xticks(rotation = 60);
plt.legend()
# Graph labels
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values');