import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
%matplotlib inline
data = pd.read_csv('bike_rental_hour.csv')
data.describe()
instant | season | yr | mnth | hr | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17379.0000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 | 17379.000000 |
mean | 8690.0000 | 2.501640 | 0.502561 | 6.537775 | 11.546752 | 0.028770 | 3.003683 | 0.682721 | 1.425283 | 0.496987 | 0.475775 | 0.627229 | 0.190098 | 35.676218 | 153.786869 | 189.463088 |
std | 5017.0295 | 1.106918 | 0.500008 | 3.438776 | 6.914405 | 0.167165 | 2.005771 | 0.465431 | 0.639357 | 0.192556 | 0.171850 | 0.192930 | 0.122340 | 49.305030 | 151.357286 | 181.387599 |
min | 1.0000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.020000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 4345.5000 | 2.000000 | 0.000000 | 4.000000 | 6.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.340000 | 0.333300 | 0.480000 | 0.104500 | 4.000000 | 34.000000 | 40.000000 |
50% | 8690.0000 | 3.000000 | 1.000000 | 7.000000 | 12.000000 | 0.000000 | 3.000000 | 1.000000 | 1.000000 | 0.500000 | 0.484800 | 0.630000 | 0.194000 | 17.000000 | 115.000000 | 142.000000 |
75% | 13034.5000 | 3.000000 | 1.000000 | 10.000000 | 18.000000 | 0.000000 | 5.000000 | 1.000000 | 2.000000 | 0.660000 | 0.621200 | 0.780000 | 0.253700 | 48.000000 | 220.000000 | 281.000000 |
max | 17379.0000 | 4.000000 | 1.000000 | 12.000000 | 23.000000 | 1.000000 | 6.000000 | 1.000000 | 4.000000 | 1.000000 | 1.000000 | 1.000000 | 0.850700 | 367.000000 | 886.000000 | 977.000000 |
data.head(10)
instant | dteday | season | yr | mnth | hr | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2011-01-01 | 1 | 0 | 1 | 0 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.81 | 0.0000 | 3 | 13 | 16 |
1 | 2 | 2011-01-01 | 1 | 0 | 1 | 1 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0.0000 | 8 | 32 | 40 |
2 | 3 | 2011-01-01 | 1 | 0 | 1 | 2 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0.0000 | 5 | 27 | 32 |
3 | 4 | 2011-01-01 | 1 | 0 | 1 | 3 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0.0000 | 3 | 10 | 13 |
4 | 5 | 2011-01-01 | 1 | 0 | 1 | 4 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0.0000 | 0 | 1 | 1 |
5 | 6 | 2011-01-01 | 1 | 0 | 1 | 5 | 0 | 6 | 0 | 2 | 0.24 | 0.2576 | 0.75 | 0.0896 | 0 | 1 | 1 |
6 | 7 | 2011-01-01 | 1 | 0 | 1 | 6 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0.0000 | 2 | 0 | 2 |
7 | 8 | 2011-01-01 | 1 | 0 | 1 | 7 | 0 | 6 | 0 | 1 | 0.20 | 0.2576 | 0.86 | 0.0000 | 1 | 2 | 3 |
8 | 9 | 2011-01-01 | 1 | 0 | 1 | 8 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0.0000 | 1 | 7 | 8 |
9 | 10 | 2011-01-01 | 1 | 0 | 1 | 9 | 0 | 6 | 0 | 1 | 0.32 | 0.3485 | 0.76 | 0.0000 | 8 | 6 | 14 |
plt.figure(figsize = (10,10))
plt.hist(data['cnt'], bins= 50)
plt.xlabel('Number of Bike Rentals')
plt.ylabel('Frequency of Value')
plt.show()
data['cnt'].value_counts()
#Displays the value in the count row, followed by the frequency of those values#
5 260 6 236 4 231 3 224 2 208 7 198 8 182 1 158 10 155 11 147 9 128 12 122 13 113 16 105 17 102 14 97 20 91 21 90 26 90 15 89 28 85 23 81 18 79 24 76 31 75 25 72 27 71 19 69 32 68 36 67 ... 754 1 977 1 913 1 897 1 865 1 849 1 708 1 804 1 792 1 948 1 728 1 696 1 967 1 863 1 871 1 775 1 727 1 870 1 854 1 806 1 774 1 758 1 630 1 917 1 901 1 725 1 709 1 661 1 629 1 887 1 Name: cnt, Length: 869, dtype: int64
data['cnt'].sum()
3292679
data.corr()['cnt'].sort_values()
hum -0.322911 weathersit -0.142426 holiday -0.030927 weekday 0.026900 workingday 0.030284 windspeed 0.093234 mnth 0.120638 season 0.178056 yr 0.250495 instant 0.278379 hr 0.394071 atemp 0.400929 temp 0.404772 casual 0.694564 registered 0.972151 cnt 1.000000 Name: cnt, dtype: float64
data.corr()['registered'].sort_values().drop('casual')
hum -0.273933 weathersit -0.120966 holiday -0.047345 weekday 0.021578 windspeed 0.082321 mnth 0.122273 workingday 0.134326 season 0.174226 yr 0.253684 instant 0.282046 atemp 0.332559 temp 0.335361 hr 0.374141 cnt 0.972151 registered 1.000000 Name: registered, dtype: float64
data.corr()['casual'].sort_values().drop('registered')
hum -0.347028 workingday -0.300942 weathersit -0.152628 holiday 0.031564 weekday 0.032721 mnth 0.068457 windspeed 0.090287 season 0.120206 yr 0.142779 instant 0.158295 hr 0.301202 atemp 0.454080 temp 0.459616 cnt 0.694564 casual 1.000000 Name: casual, dtype: float64
(data.corr()['casual'].sort_values().drop('registered')
- data.corr()['registered'].sort_values().drop('casual')).sort_values()
workingday -0.435268 cnt -0.277587 instant -0.123750 yr -0.110906 hum -0.073095 hr -0.072939 season -0.054019 mnth -0.053816 weathersit -0.031662 windspeed 0.007966 weekday 0.011144 holiday 0.078909 atemp 0.121521 temp 0.124255 casual NaN registered NaN dtype: float64
data['registered'].sum() - data['casual'].sum()
2052645
hour_to_counts = data.groupby(by = 'hr')['cnt'].mean().sort_values(ascending = False)
hour_to_counts
hr 17 461.452055 18 425.510989 8 359.011004 16 311.983562 19 311.523352 13 253.661180 12 253.315934 15 251.233196 14 240.949246 20 226.030220 9 219.309491 7 212.064649 11 208.143054 10 173.668501 21 172.314560 22 131.335165 23 87.831044 6 76.044138 0 53.898072 1 33.375691 2 22.869930 5 19.889819 3 11.727403 4 6.352941 Name: cnt, dtype: float64
plt.xticks(np.arange(0, 23, 2))
plt.scatter(x= hour_to_counts.index , y= hour_to_counts.values)
plt.xlabel('Hour Of The Day')
plt.ylabel('Average Number Of Users')
plt.show()
violin_data = []
for i in range(0,24):
violin_data.append(data[data['hr'] == i]['cnt'])
plt.violinplot(violin_data)
plt.xlabel('Hour Of The Day')
plt.ylabel('Number Of Users')
plt.show()
bins = [-1, 6, 11, 15, 19, 24]
labels = [1, 3, 4, 5, 2]
data['hr_label'] = pd.cut(data['hr'], bins=bins, labels=labels)
data[['hr','hr_label']][:25]
hr | hr_label | |
---|---|---|
0 | 0 | 1 |
1 | 1 | 1 |
2 | 2 | 1 |
3 | 3 | 1 |
4 | 4 | 1 |
5 | 5 | 1 |
6 | 6 | 1 |
7 | 7 | 3 |
8 | 8 | 3 |
9 | 9 | 3 |
10 | 10 | 3 |
11 | 11 | 3 |
12 | 12 | 4 |
13 | 13 | 4 |
14 | 14 | 4 |
15 | 15 | 4 |
16 | 16 | 5 |
17 | 17 | 5 |
18 | 18 | 5 |
19 | 19 | 5 |
20 | 20 | 2 |
21 | 21 | 2 |
22 | 22 | 2 |
23 | 23 | 2 |
24 | 0 | 1 |
plt.xticks(np.arange(0, 23, 2))
plt.scatter(x= hour_to_counts.index , y= hour_to_counts.values)
plt.xlabel('Hour Of The Day')
plt.ylabel('Average Number Of Users')
plt.axvline(x = 6.5)
plt.axvline(x = 11.5)
plt.axvline(x = 15.5)
plt.axvline(x = 19.5)
plt.show()
#Need to convert our bin numbers to int before running correlation#
data['hr_label'] = data['hr_label'].astype(int)
print(data.corr()['cnt']['hr'])
print(data.corr()['cnt']['hr_label'])
0.39407149778293477 0.645164760162967
#For each month find the average count#
month_to_counts = data.groupby(by = 'mnth')['cnt'].mean().sort_values(ascending = False)
month_to_counts
mnth 9 240.773138 6 240.515278 8 238.097627 7 231.819892 5 222.907258 10 222.158511 4 187.260960 11 177.335421 3 155.410726 12 142.303439 2 112.865026 1 94.424773 Name: cnt, dtype: float64
month_data = []
for i in range(1,13):
month_data.append(data[data['mnth'] == i]['cnt'])
plt.violinplot(month_data)
plt.xlabel('Month')
plt.ylabel('Number Of Users')
plt.show()
plt.xticks(np.arange(0, 13, 1))
plt.scatter(x= month_to_counts.index , y= month_to_counts.values)
plt.xlabel('Month')
plt.ylabel('Average Number Of Users')
plt.show()
month_to_season = data.groupby(by = 'mnth')['season'].min().sort_values(ascending = False)
month_to_season
mnth 11 4 10 4 9 3 8 3 7 3 6 2 5 2 4 2 12 1 3 1 2 1 1 1 Name: season, dtype: int64
plt.xticks(np.arange(0, 12, 1))
plt.scatter(x= month_to_counts.index , y= month_to_counts.values)
plt.xlabel('Month')
plt.ylabel('Average Number Of Users')
plt.axvline(x = 3.5)
plt.axvline(x = 6.5)
plt.axvline(x = 9.5)
plt.axvline(x = 11.5)
plt.show()
plt.xticks(np.arange(0, 12, 1))
plt.scatter(x= month_to_counts.index , y= month_to_counts.values)
plt.xlabel('Month')
plt.ylabel('Average Number Of Users')
plt.axhline(y = 120)
plt.axhline(y = 200)
plt.show()
bins = [-1, 2, 4, 10, 12]
labels = [1, 2, 3, 4]
data['high_low_season_label'] = pd.cut(data['mnth'], bins=bins, labels=labels)
data['high_low_season_label'].replace(4,2, inplace= True)
data[['mnth','high_low_season_label']].sample(5)
mnth | high_low_season_label | |
---|---|---|
3975 | 6 | 3 |
9757 | 2 | 1 |
1548 | 3 | 2 |
3407 | 5 | 3 |
7052 | 10 | 3 |
data['high_low_season_label'] = data['high_low_season_label'].astype(int)
print(data.corr()['cnt']['mnth'])
print(data.corr()['cnt']['high_low_season_label'])
0.12063776021314714 0.2654581805348883
weather_columns = ['weathersit', 'atemp', 'hum', 'windspeed']
for column in weather_columns:
violin_data = []
for value in (data[column].unique()):
violin_data.append(data[data[column] == value]['cnt'])
plt.violinplot(violin_data)
plt.xlabel('{} column values'.format(column))
plt.ylabel('Number Of Users')
plt.show()
weather_columns = ['weathersit', 'atemp', 'hum', 'windspeed']
for column in weather_columns:
weather_to_counts = data.groupby(by = column)['cnt'].mean().sort_values(ascending = False)
plt.scatter(x = weather_to_counts.index, y = weather_to_counts.values)
plt.xlabel('{} column'.format(column))
plt.ylabel('Average Number Of Users')
plt.show()
plt.scatter(x = data['atemp'], y=data['cnt'], alpha = 0.015)
<matplotlib.collections.PathCollection at 0x7f382faed320>
plt.hist(data['atemp'], bins = 100)
plt.show()
data.groupby(by = 'atemp')['cnt'].count().sort_values(ascending= False).head(5)
atemp 0.6212 988 0.5152 618 0.4091 614 0.3333 600 0.6667 593 Name: cnt, dtype: int64
data.groupby(by = 'temp')['cnt'].count().sort_values(ascending= False).head(5)
temp 0.62 726 0.66 693 0.64 692 0.70 690 0.60 675 Name: cnt, dtype: int64
plt.hist(data['temp'], bins = 100)
plt.show()
day_type_columns = ['weekday', 'workingday', 'holiday']
for column in day_type_columns:
day_to_counts = data.groupby(by = column)['cnt'].mean().sort_values(ascending = False)
plt.scatter(x = day_to_counts.index, y = day_to_counts.values)
plt.xlabel('{} column'.format(column))
plt.ylabel('Average Number Of Users')
plt.show()
data.groupby(by = 'weekday')[['cnt','registered','casual']].mean().sort_values(by = 'cnt', ascending = False)
cnt | registered | casual | |
---|---|---|---|
weekday | |||
4 | 196.436665 | 171.564144 | 24.872521 |
5 | 196.135907 | 164.677121 | 31.458786 |
2 | 191.238891 | 167.658377 | 23.580514 |
3 | 191.130505 | 167.971313 | 23.159192 |
6 | 190.209793 | 128.962978 | 61.246815 |
1 | 183.744655 | 155.191206 | 28.553449 |
0 | 177.468825 | 121.305356 | 56.163469 |
predictor_columns = ['hr_label','mnth','season','weathersit', 'temp', 'hum', 'windspeed', 'workingday', 'weekday', 'holiday', 'high_low_season_label']
linear_model = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(data[predictor_columns], data['cnt'], test_size = 0.2, random_state = 0)
linear_model.fit(X_train, y_train)
predictions = linear_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(mse)
15915.201495043528
fig, ax = plt.subplots()
ax.scatter(x= y_test, y= predictions, alpha = 0.1)
plt.xlabel('Original Test Values')
plt.ylabel('Predicted Test Values')
plt.show()
linear_errors = pd.DataFrame(y_test, columns = ['cnt'])
linear_errors['predicted'] = predictions
linear_errors['difference'] = np.abs(linear_errors['predicted'] - linear_errors['cnt'])
linear_errors.sort_values(by='difference', ascending = False).head()
cnt | predicted | difference | |
---|---|---|---|
15579 | 835 | 256.373186 | 578.626814 |
10622 | 957 | 385.357779 | 571.642221 |
16621 | 729 | 160.773290 | 568.226710 |
14773 | 977 | 424.066594 | 552.933406 |
15075 | 805 | 254.468428 | 550.531572 |
linear_errors.sort_values(by='predicted', ascending = False).head()
cnt | predicted | difference | |
---|---|---|---|
12971 | 351 | 497.146389 | 146.146389 |
12947 | 387 | 482.399356 | 95.399356 |
4794 | 221 | 481.391966 | 260.391966 |
12779 | 320 | 480.813954 | 160.813954 |
14483 | 528 | 477.841333 | 50.158667 |
linear_errors.sort_values(by='predicted').head()
cnt | predicted | difference | |
---|---|---|---|
9650 | 24 | -132.877317 | 156.877317 |
17212 | 13 | -117.861548 | 130.861548 |
9150 | 2 | -111.586228 | 113.586228 |
9125 | 13 | -99.203609 | 112.203609 |
9485 | 17 | -98.592887 | 115.592887 |
tree_regress = DecisionTreeRegressor(min_samples_split =2)
tree_regress.fit(X_train, y_train)
tree_predictions = tree_regress.predict(X_test)
mse_basic_tree = mean_squared_error(y_test, tree_predictions)
print(mse_basic_tree)
16323.717143268124
fig, ax = plt.subplots()
ax.scatter(x= y_test, y= tree_predictions, alpha = 0.1)
plt.xlabel('Original Test Values')
plt.ylabel('Predicted Test Values')
plt.show()
tree_errors = pd.DataFrame(y_test, columns = ['cnt'])
tree_errors['predicted'] = tree_predictions
tree_errors['difference'] = np.abs(tree_errors['predicted'] - tree_errors['cnt'])
tree_errors.sort_values(by='difference', ascending = False).head()
cnt | predicted | difference | |
---|---|---|---|
15890 | 115 | 806.0 | 691.0 |
15723 | 728 | 101.0 | 627.0 |
2365 | 61 | 677.0 | 616.0 |
15075 | 805 | 193.0 | 612.0 |
1846 | 165 | 762.0 | 597.0 |
tree_errors.sort_values(by='predicted', ascending = False).head()
cnt | predicted | difference | |
---|---|---|---|
14243 | 467 | 970.0 | 503.0 |
15781 | 858 | 963.0 | 105.0 |
15587 | 446 | 943.0 | 497.0 |
15731 | 524 | 922.0 | 398.0 |
13934 | 686 | 891.0 | 205.0 |
def DecisionTreeCreator(a, b, c, d):
X_train, X_test, y_train, y_test = train_test_split(data[predictor_columns], data['cnt'], test_size = 0.2, random_state = 0)
tree_regress = DecisionTreeRegressor(max_depth = a, min_samples_split = b,
min_samples_leaf = c, max_leaf_nodes = d)
tree_regress.fit(X_train, y_train)
tree_predictions = tree_regress.predict(X_test)
mse_tree = mean_squared_error(y_test, tree_predictions)
tree_errors = pd.DataFrame(y_test, columns = ['cnt'])
tree_errors['predicted'] = tree_predictions
tree_errors['difference'] = np.abs(tree_errors['predicted'] - tree_errors['cnt'])
temp_tree_dataframe = tree_errors.sort_values(by='difference', ascending = False).head()
print('For variables {}, {} ,{}, {} the mean squared error was {}'.format(a,b,c,d,mse_tree))
#Random seed used for repeatability only#
np.random.seed(1)
for i in range(5):
#create 5 random integers between 2 and 10#
a, b, c, d = np.random.randint(2, 10, size = 4)
DecisionTreeCreator(a, b, c, d)
For variables 7, 5 ,6, 2 the mean squared error was 22928.326114677056 For variables 3, 5 ,7, 9 the mean squared error was 14337.975222668976 For variables 2, 3 ,6, 9 the mean squared error was 14969.137903755653 For variables 6, 8 ,3, 4 the mean squared error was 17516.58970607434 For variables 8, 7 ,4, 6 the mean squared error was 15474.589804754429
np.random.seed(1)
for i in range(7):
#create 5 random integers between 2 and 10#
a,b,c = 3,3,3
d = 13 + i
DecisionTreeCreator(a, b, c, d)
For variables 3, 3 ,3, 13 the mean squared error was 13942.516096029993 For variables 3, 3 ,3, 14 the mean squared error was 13929.106051338153 For variables 3, 3 ,3, 15 the mean squared error was 13921.030558127797 For variables 3, 3 ,3, 16 the mean squared error was 13913.653646822506 For variables 3, 3 ,3, 17 the mean squared error was 13913.653646822506 For variables 3, 3 ,3, 18 the mean squared error was 13913.653646822506 For variables 3, 3 ,3, 19 the mean squared error was 13913.653646822506
np.random.seed(1)
for i in range(5):
#create 10 random integers between 2 and 10#
a, b, c = np.random.randint(2, 2000, size = 3)
d = 16
DecisionTreeCreator(a, b, c, d)
For variables 1063, 237 ,1934, 16 the mean squared error was 17988.426516454183 For variables 1793, 907 ,717, 16 the mean squared error was 14309.817118612249 For variables 849, 962 ,146, 16 the mean squared error was 13616.790650461153 For variables 1998, 1609 ,751, 16 the mean squared error was 14319.571898943967 For variables 1416, 1307 ,1204, 16 the mean squared error was 15394.119979668272
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)
random_forest_predictions = random_forest.predict(X_test)
mse_random_forest = mean_squared_error(y_test, random_forest_predictions)
print(mse_random_forest)
10169.928235267103
random_forest_errors = pd.DataFrame(y_test, columns = ['cnt'])
random_forest_errors['predicted'] = random_forest_predictions
random_forest_errors['difference'] = np.abs(random_forest_errors['predicted'] - random_forest_errors['cnt'])
print(random_forest_errors.sort_values(by='difference', ascending = False).head())
cnt predicted difference 15315 785 199.7 585.3 15579 835 274.1 560.9 13587 788 238.9 549.1 14427 808 260.3 547.7 14619 646 138.2 507.8
fig, ax = plt.subplots()
ax.scatter(x= y_test, y= random_forest_predictions, alpha = 0.1)
plt.title('Random Forest Results')
plt.xlabel('Original Test Values')
plt.ylabel('Predicted Test Values')
plt.show()
fig, ax = plt.subplots()
ax.scatter(x= y_test, y= tree_predictions, alpha = 0.1)
plt.title('Decision Tree Results')
plt.xlabel('Original Test Values')
plt.ylabel('Predicted Test Values')
plt.show()
casual_linear_predictions = np.empty(shape = predictions.shape)
registered_linear_predictions = np.empty(shape = predictions.shape)
for column in ['casual', 'registered']:
X_train, X_test, y_train, y_test = train_test_split(data[predictor_columns], data[column], test_size = 0.2, random_state = 0)
linear_model.fit(X_train, y_train)
if column == 'casual':
casual_linear_predictions = linear_model.predict(X_test)
else:
registered_linear_predictions = linear_model.predict(X_test)
linear_means = np.mean([[casual_linear_predictions], [registered_linear_predictions]], axis =0)
mse_dual_linear = mean_squared_error(y_test, linear_means[0])
mse_dual_linear
17634.65455743989
fig, ax = plt.subplots()
ax.scatter(x= y_test, y= linear_means[0], alpha = 0.1)
plt.show()
casual_linear_predictions_no_negatives = np.empty(shape = predictions.shape)
registered_linear_predictions_no_negatives = np.empty(shape = predictions.shape)
for column in ['casual', 'registered']:
X_train, X_test, y_train, y_test = train_test_split(data[predictor_columns], data[column], test_size = 0.2, random_state = 0)
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
if column == 'casual':
casual_linear_predictions_no_negatives = linear_model.predict(X_test)
else:
registered_linear_predictions_no_negatives = linear_model.predict(X_test)
# Replace negative predictions with 0#
casual_linear_predictions_no_negatives[casual_linear_predictions_no_negatives<0] = 0
registered_linear_predictions_no_negatives[registered_linear_predictions_no_negatives<0] = 0
linear_means_no_negatives = np.mean([[casual_linear_predictions_no_negatives], [registered_linear_predictions_no_negatives]], axis =0)
mse_dual_linear_no_negatives = mean_squared_error(y_test, linear_means_no_negatives[0])
mse_dual_linear_no_negatives
17543.03955009062
fig, ax = plt.subplots()
ax.scatter(x= y_test, y= linear_means_no_negatives[0], alpha = 0.1)
plt.show()